• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
3  * Copyright © 2018 Google, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors:
25  *    Rob Clark <robclark@freedesktop.org>
26  */
27 
28 #define FD_BO_NO_HARDPIN 1
29 
30 #include "pipe/p_state.h"
31 #include "util/bitset.h"
32 #include "util/format/u_format.h"
33 #include "util/u_inlines.h"
34 #include "util/u_memory.h"
35 #include "util/u_string.h"
36 
37 #include "freedreno_program.h"
38 
39 #include "fd6_const.h"
40 #include "fd6_emit.h"
41 #include "fd6_pack.h"
42 #include "fd6_program.h"
43 #include "fd6_texture.h"
44 
45 /**
46  * Temporary program building state.
47  */
48 struct program_builder {
49    struct fd6_program_state *state;
50    struct fd_context *ctx;
51    const struct ir3_cache_key *key;
52    const struct ir3_shader_variant *vs;
53    const struct ir3_shader_variant *hs;
54    const struct ir3_shader_variant *ds;
55    const struct ir3_shader_variant *gs;
56    const struct ir3_shader_variant *fs;
57    const struct ir3_shader_variant *last_shader;
58    bool binning_pass;
59 };
60 
/* Per-stage register addresses for the registers whose layout is identical
 * across shader stages; indexed by gl_shader_stage.  Lets fd6_emit_shader()
 * share one code path for all stages.
 */
static const struct xs_config {
   uint16_t reg_sp_xs_instrlen;
   uint16_t reg_hlsq_xs_ctrl;
   uint16_t reg_sp_xs_first_exec_offset;
   uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
} xs_config[] = {
   [MESA_SHADER_VERTEX] = {
      REG_A6XX_SP_VS_INSTRLEN,
      REG_A6XX_HLSQ_VS_CNTL,
      REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_TESS_CTRL] = {
      REG_A6XX_SP_HS_INSTRLEN,
      REG_A6XX_HLSQ_HS_CNTL,
      REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_TESS_EVAL] = {
      REG_A6XX_SP_DS_INSTRLEN,
      REG_A6XX_HLSQ_DS_CNTL,
      REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_GEOMETRY] = {
      REG_A6XX_SP_GS_INSTRLEN,
      REG_A6XX_HLSQ_GS_CNTL,
      REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_FRAGMENT] = {
      REG_A6XX_SP_FS_INSTRLEN,
      REG_A6XX_HLSQ_FS_CNTL,
      REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_COMPUTE] = {
      REG_A6XX_SP_CS_INSTRLEN,
      REG_A6XX_HLSQ_CS_CNTL,
      REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
   },
};
104 
105 void
fd6_emit_shader(struct fd_context * ctx,struct fd_ringbuffer * ring,const struct ir3_shader_variant * so)106 fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
107                 const struct ir3_shader_variant *so)
108 {
109    if (!so) {
110       /* shader stage disabled */
111       return;
112    }
113 
114 #ifdef DEBUG
115    /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */
116    const char *name = so->name;
117    if (name)
118       fd_emit_string5(ring, name, strlen(name));
119 #endif
120 
121    gl_shader_stage type = so->type;
122    if (type == MESA_SHADER_COMPUTE)
123       type = MESA_SHADER_COMPUTE;
124 
125    enum a6xx_threadsize thrsz =
126       so->info.double_threadsize ? THREAD128 : THREAD64;
127 
128    switch (type) {
129    case MESA_SHADER_VERTEX:
130       OUT_REG(ring, A6XX_SP_VS_CTRL_REG0(
131                .halfregfootprint = so->info.max_half_reg + 1,
132                .fullregfootprint = so->info.max_reg + 1,
133                .branchstack = ir3_shader_branchstack_hw(so),
134                .mergedregs = so->mergedregs,
135       ));
136       break;
137    case MESA_SHADER_TESS_CTRL:
138       OUT_REG(ring, A6XX_SP_HS_CTRL_REG0(
139                .halfregfootprint = so->info.max_half_reg + 1,
140                .fullregfootprint = so->info.max_reg + 1,
141                .branchstack = ir3_shader_branchstack_hw(so),
142       ));
143       break;
144    case MESA_SHADER_TESS_EVAL:
145       OUT_REG(ring, A6XX_SP_DS_CTRL_REG0(
146                .halfregfootprint = so->info.max_half_reg + 1,
147                .fullregfootprint = so->info.max_reg + 1,
148                .branchstack = ir3_shader_branchstack_hw(so),
149       ));
150       break;
151    case MESA_SHADER_GEOMETRY:
152       OUT_REG(ring, A6XX_SP_GS_CTRL_REG0(
153                .halfregfootprint = so->info.max_half_reg + 1,
154                .fullregfootprint = so->info.max_reg + 1,
155                .branchstack = ir3_shader_branchstack_hw(so),
156       ));
157       break;
158    case MESA_SHADER_FRAGMENT:
159       OUT_REG(ring, A6XX_SP_FS_CTRL_REG0(
160                .halfregfootprint = so->info.max_half_reg + 1,
161                .fullregfootprint = so->info.max_reg + 1,
162                .branchstack = ir3_shader_branchstack_hw(so),
163                .threadsize = thrsz,
164                .varying = so->total_in != 0,
165                .lodpixmask = so->need_full_quad,
166                /* unknown bit, seems unnecessary */
167                .unk24 = true,
168                .pixlodenable = so->need_pixlod,
169                .mergedregs = so->mergedregs,
170       ));
171       break;
172    case MESA_SHADER_COMPUTE:
173       thrsz = ctx->screen->info->a6xx.supports_double_threadsize ? thrsz : THREAD128;
174       OUT_REG(ring, A6XX_SP_CS_CTRL_REG0(
175                .halfregfootprint = so->info.max_half_reg + 1,
176                .fullregfootprint = so->info.max_reg + 1,
177                .branchstack = ir3_shader_branchstack_hw(so),
178                .threadsize = thrsz,
179                .mergedregs = so->mergedregs,
180       ));
181       break;
182    default:
183       unreachable("bad shader stage");
184    }
185 
186    const struct xs_config *cfg = &xs_config[type];
187 
188    OUT_PKT4(ring, cfg->reg_sp_xs_instrlen, 1);
189    OUT_RING(ring, so->instrlen);
190 
191    /* emit program binary & private memory layout
192     */
193 
194    ir3_get_private_mem(ctx, so);
195 
196    uint32_t per_sp_size = ctx->pvtmem[so->pvtmem_per_wave].per_sp_size;
197 
198    fd_ringbuffer_attach_bo(ring, so->bo);
199 
200    OUT_PKT4(ring, cfg->reg_sp_xs_first_exec_offset, 7);
201    OUT_RING(ring, 0);                /* SP_xS_OBJ_FIRST_EXEC_OFFSET */
202    OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */
203    OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size));
204    if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */
205       fd_ringbuffer_attach_bo(ring, ctx->pvtmem[so->pvtmem_per_wave].bo);
206       OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0);
207    } else {
208       OUT_RING(ring, 0);
209       OUT_RING(ring, 0);
210    }
211    OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) |
212                      COND(so->pvtmem_per_wave,
213                           A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
214 
215    OUT_PKT4(ring, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
216    OUT_RING(ring, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size));
217 
218    uint32_t shader_preload_size =
219       MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size);
220 
221    enum a6xx_state_block sb = fd6_stage2shadersb(so->type);
222    OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
223    OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
224                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
225                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
226                      CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
227                      CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
228    OUT_RELOC(ring, so->bo, 0, 0, 0);
229 
230    fd6_emit_immediates(so, ring);
231 }
232 
233 /**
234  * Build a pre-baked state-obj to disable SO, so that we aren't dynamically
235  * building this at draw time whenever we transition from SO enabled->disabled
236  */
static void
setup_stream_out_disable(struct fd_context *ctx)
{
   /* VPC_SO_CNTL + VPC_SO_STREAM_CNTL, each written as a (reg, value) pair: */
   unsigned sizedw = 4;

   /* Some HW additionally needs PC_SO_STREAM_CNTL cleared: */
   if (ctx->screen->info->a6xx.tess_use_shared)
      sizedw += 2;

   /* +1 dword for the CP_CONTEXT_REG_BUNCH packet header: */
   struct fd_ringbuffer *ring =
      fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4);

   OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw);
   OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
   OUT_RING(ring, 0);
   OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
   OUT_RING(ring, 0);

   if (ctx->screen->info->a6xx.tess_use_shared) {
      OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL);
      OUT_RING(ring, 0);
   }

   fd6_context(ctx)->streamout_disable_stateobj = ring;
}
261 
262 static void
setup_stream_out(struct fd_context * ctx,struct fd6_program_state * state,const struct ir3_shader_variant * v,struct ir3_shader_linkage * l)263 setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state,
264                  const struct ir3_shader_variant *v,
265                  struct ir3_shader_linkage *l)
266 {
267    const struct ir3_stream_output_info *strmout = &v->stream_output;
268 
269    /* Note: 64 here comes from the HW layout of the program RAM. The program
270     * for stream N is at DWORD 64 * N.
271     */
272 #define A6XX_SO_PROG_DWORDS 64
273    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
274    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
275 
276    memset(prog, 0, sizeof(prog));
277 
278    for (unsigned i = 0; i < strmout->num_outputs; i++) {
279       const struct ir3_stream_output *out = &strmout->output[i];
280       unsigned k = out->register_index;
281       unsigned idx;
282 
283       /* linkage map sorted by order frag shader wants things, so
284        * a bit less ideal here..
285        */
286       for (idx = 0; idx < l->cnt; idx++)
287          if (l->var[idx].slot == v->outputs[k].slot)
288             break;
289 
290       assert(idx < l->cnt);
291 
292       for (unsigned j = 0; j < out->num_components; j++) {
293          unsigned c = j + out->start_component;
294          unsigned loc = l->var[idx].loc + c;
295          unsigned off = j + out->dst_offset; /* in dwords */
296 
297          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
298          if (loc & 1) {
299             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
300                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
301                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
302          } else {
303             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
304                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
305                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
306          }
307          BITSET_SET(valid_dwords, dword);
308       }
309    }
310 
311    unsigned prog_count = 0;
312    unsigned start, end;
313    BITSET_FOREACH_RANGE (start, end, valid_dwords,
314                          A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
315       prog_count += end - start + 1;
316    }
317 
318    const bool emit_pc_so_stream_cntl =
319          ctx->screen->info->a6xx.tess_use_shared &&
320          v->type == MESA_SHADER_TESS_EVAL;
321 
322    unsigned sizedw = 10 + (2 * prog_count);
323    if (emit_pc_so_stream_cntl)
324       sizedw += 2;
325 
326    struct fd_ringbuffer *ring =
327       fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4);
328 
329    OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw);
330    OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
331    OUT_RING(ring,
332             A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(strmout->streams_written) |
333             COND(strmout->stride[0] > 0,
334                  A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + strmout->output[0].stream)) |
335             COND(strmout->stride[1] > 0,
336                  A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + strmout->output[1].stream)) |
337             COND(strmout->stride[2] > 0,
338                  A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + strmout->output[2].stream)) |
339             COND(strmout->stride[3] > 0,
340                  A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + strmout->output[3].stream)));
341    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(0));
342    OUT_RING(ring, strmout->stride[0]);
343    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(1));
344    OUT_RING(ring, strmout->stride[1]);
345    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(2));
346    OUT_RING(ring, strmout->stride[2]);
347    OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(3));
348    OUT_RING(ring, strmout->stride[3]);
349 
350    bool first = true;
351    BITSET_FOREACH_RANGE (start, end, valid_dwords,
352                          A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
353       OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
354       OUT_RING(ring, COND(first, A6XX_VPC_SO_CNTL_RESET) |
355                      A6XX_VPC_SO_CNTL_ADDR(start));
356       for (unsigned i = start; i < end; i++) {
357          OUT_RING(ring, REG_A6XX_VPC_SO_PROG);
358          OUT_RING(ring, prog[i]);
359       }
360       first = false;
361    }
362 
363    if (emit_pc_so_stream_cntl) {
364       /* Possibly not tess_use_shared related, but the combination of
365        * tess + xfb fails some tests if we don't emit this.
366        */
367       OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL);
368       OUT_RING(ring, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(0x1));
369    }
370 
371    state->streamout_stateobj = ring;
372 }
373 
/* Build the SP_xS_CONFIG value for a shader stage.  The A6XX_SP_VS_CONFIG_*
 * field layout is shared across all stages, so the VS-named builders are used
 * for every stage.  Returns 0 (stage disabled) when there is no variant.
 */
static uint32_t
sp_xs_config(const struct ir3_shader_variant *v)
{
   if (!v)
      return 0;

   return A6XX_SP_VS_CONFIG_ENABLED |
         COND(v->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
         COND(v->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
         COND(v->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
         COND(v->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
         A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(v)) |
         A6XX_SP_VS_CONFIG_NTEX(v->num_samp) |
         A6XX_SP_VS_CONFIG_NSAMP(v->num_samp);
}
389 
/* Build the "config" state-object: HLSQ invalidate + per-stage HLSQ_xS_CNTL
 * (constlen/enable) and SP_xS_CONFIG registers, plus the FS IBO count.
 * Stored in state->config_stateobj.
 */
template <chip CHIP>
static void
setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
{
   struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 100 * 4);

   OUT_REG(ring, HLSQ_INVALIDATE_CMD(CHIP, .vs_state = true, .hs_state = true,
                                          .ds_state = true, .gs_state = true,
                                          .fs_state = true, .cs_state = true,
                                          .cs_ibo = true, .gfx_ibo = true, ));

   /* The binning-pass VS shares the draw-pass VS constlen programming, so it
    * must not need more consts than the full VS:
    */
   assert(state->vs->constlen >= state->bs->constlen);

   OUT_REG(ring, HLSQ_VS_CNTL(
         CHIP,
         .constlen = state->vs->constlen,
         .enabled = true,
   ));
   /* HS/DS/GS may be absent; COND() gives 0/false in that case: */
   OUT_REG(ring, HLSQ_HS_CNTL(
         CHIP,
         .constlen = COND(state->hs, state->hs->constlen),
         .enabled = COND(state->hs, true),
   ));
   OUT_REG(ring, HLSQ_DS_CNTL(
         CHIP,
         .constlen = COND(state->ds, state->ds->constlen),
         .enabled = COND(state->ds, true),
   ));
   OUT_REG(ring, HLSQ_GS_CNTL(
         CHIP,
         .constlen = COND(state->gs, state->gs->constlen),
         .enabled = COND(state->gs, true),
   ));
   OUT_REG(ring, HLSQ_FS_CNTL(
         CHIP,
         .constlen = state->fs->constlen,
         .enabled = true,
   ));

   OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->vs));

   OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->hs));

   OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->ds));

   OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->gs));

   OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1);
   OUT_RING(ring, sp_xs_config(state->fs));

   OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
   OUT_RING(ring, ir3_shader_nibo(state->fs));

   state->config_stateobj = ring;
}
449 
450 static inline uint32_t
next_regid(uint32_t reg,uint32_t increment)451 next_regid(uint32_t reg, uint32_t increment)
452 {
453    if (VALIDREG(reg))
454       return reg + increment;
455    else
456       return regid(63, 0);
457 }
458 
/* Upload the tessellation BO addresses as shader constants, starting at the
 * const slot just past primitive_param.  Two vec4s (dwords = 8): the first
 * reloc points FD6_TESS_FACTOR_SIZE into tess_bo, the second at its base.
 * No-op when the slot is beyond the variant's constlen (consts unused).
 */
static void
fd6_emit_tess_bos(struct fd_screen *screen, struct fd_ringbuffer *ring,
                  const struct ir3_shader_variant *s) assert_dt
{
   const struct ir3_const_state *const_state = ir3_const_state(s);
   const unsigned regid = const_state->offsets.primitive_param + 1;
   uint32_t dwords = 8;

   if (regid >= s->constlen)
      return;

   fd_ringbuffer_attach_bo(ring, screen->tess_bo);

   /* 7 = header dword + 2 pad + 2x 64-bit reloc: */
   OUT_PKT7(ring, fd6_stage2opcode(s->type), 7);
   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid) |
                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                     CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                     CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) |
                     CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4));
   OUT_RING(ring, 0);
   OUT_RING(ring, 0);
   OUT_RELOC(ring, screen->tess_bo, FD6_TESS_FACTOR_SIZE, 0, 0);
   OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
}
483 
484 static enum a6xx_tess_output
primitive_to_tess(enum mesa_prim primitive)485 primitive_to_tess(enum mesa_prim primitive)
486 {
487    switch (primitive) {
488    case MESA_PRIM_POINTS:
489       return TESS_POINTS;
490    case MESA_PRIM_LINE_STRIP:
491       return TESS_LINES;
492    case MESA_PRIM_TRIANGLE_STRIP:
493       return TESS_CW_TRIS;
494    default:
495       unreachable("");
496    }
497 }
498 
499 #define MAX_VERTEX_ATTRIBS 32
500 
/* Program VFD_CONTROL_0 (fetch/decode counts) and one VFD_DEST_CNTL_INSTR per
 * vertex attribute, mapping each non-sysval VS input to its destination
 * register and component writemask.
 *
 * Note: indexing vs->inputs[] by attribute slot relies on non-sysval inputs
 * being packed at the front of the array (asserted in the loop below).
 */
static void
emit_vfd_dest(struct fd_ringbuffer *ring, const struct ir3_shader_variant *vs)
{
   uint32_t attr_count = 0;

   for (uint32_t i = 0; i < vs->inputs_count; i++)
      if (!vs->inputs[i].sysval)
         attr_count++;

   OUT_REG(ring, A6XX_VFD_CONTROL_0(
                     .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
                     .decode_cnt = attr_count));

   if (attr_count)
      OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);

   for (uint32_t i = 0; i < attr_count; i++) {
      assert(vs->inputs[i].compmask);
      assert(!vs->inputs[i].sysval);
      OUT_RING(ring,
               A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) |
                  A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid));
   }
}
525 
/* Program VFD_CONTROL_1..6 with the registers that the geometry-pipeline
 * sysvals (vertex/instance/primitive id, tess coords, HS/GS headers) land in.
 * ir3_find_sysval_regid() returns the invalid-register encoding for sysvals
 * the variant doesn't read (and handles NULL stage variants).
 */
static void
emit_vs_system_values(struct fd_ringbuffer *ring,
                      const struct program_builder *b)
{
   const uint32_t vertexid_regid =
         ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_VERTEX_ID);
   const uint32_t instanceid_regid =
         ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_INSTANCE_ID);
   const uint32_t tess_coord_x_regid =
         ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_TESS_COORD);
   /* tess coord y is allocated in the scalar register after x: */
   const uint32_t tess_coord_y_regid = next_regid(tess_coord_x_regid, 1);
   const uint32_t hs_rel_patch_regid =
         ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
   const uint32_t ds_rel_patch_regid =
         ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
   const uint32_t hs_invocation_regid =
         ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_TCS_HEADER_IR3);
   const uint32_t gs_primitiveid_regid =
         ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_PRIMITIVE_ID);
   /* With tess, the VS-stage primitive-id slot feeds the HS instead: */
   const uint32_t vs_primitiveid_regid = b->hs ?
         ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_PRIMITIVE_ID) :
         gs_primitiveid_regid;
   const uint32_t ds_primitiveid_regid =
         ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_PRIMITIVE_ID);
   const uint32_t gsheader_regid =
         ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_GS_HEADER_IR3);

   /* Note: we currently don't support multiview.
    */
   const uint32_t viewid_regid = INVALID_REG;

   OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
   OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
   OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
                  A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
   OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
                  A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
   OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */
   OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
                  0xfc00); /* VFD_CONTROL_5 */
   OUT_RING(ring, COND(b->fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN)); /* VFD_CONTROL_6 */
}
573 
574 static void
emit_vpc(struct fd_ringbuffer * ring,const struct program_builder * b)575 emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b)
576 {
577    const struct ir3_shader_variant *last_shader = b->last_shader;
578 
579    /* note: doesn't compile as static because of the array regs.. */
580    const struct reg_config {
581       uint16_t reg_sp_xs_out_reg;
582       uint16_t reg_sp_xs_vpc_dst_reg;
583       uint16_t reg_vpc_xs_pack;
584       uint16_t reg_vpc_xs_clip_cntl;
585       uint16_t reg_gras_xs_cl_cntl;
586       uint16_t reg_pc_xs_out_cntl;
587       uint16_t reg_sp_xs_primitive_cntl;
588       uint16_t reg_vpc_xs_layer_cntl;
589       uint16_t reg_gras_xs_layer_cntl;
590    } reg_config[] = {
591       [MESA_SHADER_VERTEX] = {
592          REG_A6XX_SP_VS_OUT_REG(0),
593          REG_A6XX_SP_VS_VPC_DST_REG(0),
594          REG_A6XX_VPC_VS_PACK,
595          REG_A6XX_VPC_VS_CLIP_CNTL,
596          REG_A6XX_GRAS_VS_CL_CNTL,
597          REG_A6XX_PC_VS_OUT_CNTL,
598          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
599          REG_A6XX_VPC_VS_LAYER_CNTL,
600          REG_A6XX_GRAS_VS_LAYER_CNTL
601       },
602       [MESA_SHADER_TESS_CTRL] = {
603          0,
604          0,
605          0,
606          0,
607          0,
608          REG_A6XX_PC_HS_OUT_CNTL,
609          0,
610          0,
611          0
612       },
613       [MESA_SHADER_TESS_EVAL] = {
614          REG_A6XX_SP_DS_OUT_REG(0),
615          REG_A6XX_SP_DS_VPC_DST_REG(0),
616          REG_A6XX_VPC_DS_PACK,
617          REG_A6XX_VPC_DS_CLIP_CNTL,
618          REG_A6XX_GRAS_DS_CL_CNTL,
619          REG_A6XX_PC_DS_OUT_CNTL,
620          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
621          REG_A6XX_VPC_DS_LAYER_CNTL,
622          REG_A6XX_GRAS_DS_LAYER_CNTL
623       },
624       [MESA_SHADER_GEOMETRY] = {
625          REG_A6XX_SP_GS_OUT_REG(0),
626          REG_A6XX_SP_GS_VPC_DST_REG(0),
627          REG_A6XX_VPC_GS_PACK,
628          REG_A6XX_VPC_GS_CLIP_CNTL,
629          REG_A6XX_GRAS_GS_CL_CNTL,
630          REG_A6XX_PC_GS_OUT_CNTL,
631          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
632          REG_A6XX_VPC_GS_LAYER_CNTL,
633          REG_A6XX_GRAS_GS_LAYER_CNTL
634       },
635    };
636    const struct reg_config *cfg = &reg_config[b->last_shader->type];
637 
638    struct ir3_shader_linkage linkage = {
639       .primid_loc = 0xff,
640       .clip0_loc = 0xff,
641       .clip1_loc = 0xff,
642    };
643 
644    /* If we have streamout, link against the real FS, rather than the
645     * dummy FS used for binning pass state, to ensure the OUTLOC's
646     * match.  Depending on whether we end up doing sysmem or gmem,
647     * the actual streamout could happen with either the binning pass
648     * or draw pass program, but the same streamout stateobj is used
649     * in either case:
650     */
651    bool do_streamout = (b->last_shader->stream_output.num_outputs > 0);
652    ir3_link_shaders(&linkage, b->last_shader,
653                     do_streamout ? b->state->fs : b->fs,
654                     true);
655 
656    if (do_streamout)
657       ir3_link_stream_out(&linkage, b->last_shader);
658 
659    emit_vs_system_values(ring, b);
660 
661    OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
662    OUT_RING(ring, ~linkage.varmask[0]);
663    OUT_RING(ring, ~linkage.varmask[1]);
664    OUT_RING(ring, ~linkage.varmask[2]);
665    OUT_RING(ring, ~linkage.varmask[3]);
666 
667    /* a6xx finds position/pointsize at the end */
668    const uint32_t position_regid =
669       ir3_find_output_regid(last_shader, VARYING_SLOT_POS);
670    const uint32_t pointsize_regid =
671       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
672    const uint32_t layer_regid =
673       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
674    const uint32_t view_regid =
675       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
676    const uint32_t clip0_regid =
677       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
678    const uint32_t clip1_regid =
679       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
680    uint32_t flags_regid = b->gs ?
681       ir3_find_output_regid(b->gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
682 
683    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
684 
685 // XXX replace regid(63,0) with INVALID_REG
686    if (layer_regid != regid(63, 0)) {
687       layer_loc = linkage.max_loc;
688       ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
689    }
690 
691    if (view_regid != regid(63, 0)) {
692       view_loc = linkage.max_loc;
693       ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
694    }
695 
696    if (position_regid != regid(63, 0)) {
697       position_loc = linkage.max_loc;
698       ir3_link_add(&linkage, VARYING_SLOT_POS, position_regid, 0xf, linkage.max_loc);
699    }
700 
701    if (pointsize_regid != regid(63, 0)) {
702       pointsize_loc = linkage.max_loc;
703       ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
704    }
705 
706    uint8_t clip_mask = last_shader->clip_mask,
707            cull_mask = last_shader->cull_mask;
708    uint8_t clip_cull_mask = clip_mask | cull_mask;
709 
710    clip_mask &= b->key->clip_plane_enable;
711 
712    /* Handle the case where clip/cull distances aren't read by the FS */
713    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
714    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
715       clip0_loc = linkage.max_loc;
716       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
717                    clip_cull_mask & 0xf, linkage.max_loc);
718    }
719    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
720       clip1_loc = linkage.max_loc;
721       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
722                    clip_cull_mask >> 4, linkage.max_loc);
723    }
724 
725    /* If we have stream-out, we use the full shader for binning
726     * pass, rather than the optimized binning pass one, so that we
727     * have all the varying outputs available for xfb.  So streamout
728     * state should always be derived from the non-binning pass
729     * program:
730     */
731    if (do_streamout && !b->binning_pass) {
732       setup_stream_out(b->ctx, b->state, b->last_shader, &linkage);
733 
734       if (!fd6_context(b->ctx)->streamout_disable_stateobj)
735          setup_stream_out_disable(b->ctx);
736    }
737 
738    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
739     * at least when a DS is the last stage, so add a dummy output to keep it
740     * happy if there aren't any. We do this late in order to avoid emitting
741     * any unused code and make sure that optimizations don't remove it.
742     */
743    if (linkage.cnt == 0)
744       ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
745 
746    /* map outputs of the last shader to VPC */
747    assert(linkage.cnt <= 32);
748    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
749    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
750    uint16_t sp_out[32] = {0};
751    uint8_t sp_vpc_dst[32] = {0};
752    for (uint32_t i = 0; i < linkage.cnt; i++) {
753       sp_out[i] =
754          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
755          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
756       sp_vpc_dst[i] =
757          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
758    }
759 
760    OUT_PKT4(ring, cfg->reg_sp_xs_out_reg, sp_out_count);
761    OUT_BUF(ring, sp_out, sp_out_count);
762 
763    OUT_PKT4(ring, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
764    OUT_BUF(ring, sp_vpc_dst, sp_vpc_dst_count);
765 
766    OUT_PKT4(ring, cfg->reg_vpc_xs_pack, 1);
767    OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
768                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
769                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc));
770 
771    OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl, 1);
772    OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
773                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
774                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
775 
776    OUT_PKT4(ring, cfg->reg_gras_xs_cl_cntl, 1);
777    OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) |
778                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask));
779 
780    const struct ir3_shader_variant *geom_stages[] = { b->vs, b->hs, b->ds, b->gs };
781 
782    for (unsigned i = 0; i < ARRAY_SIZE(geom_stages); i++) {
783       const struct ir3_shader_variant *shader = geom_stages[i];
784       if (!shader)
785          continue;
786 
787       bool primid = shader->type != MESA_SHADER_VERTEX &&
788          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
789 
790       OUT_PKT4(ring, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
791       if (shader == last_shader) {
792          OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
793                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
794                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
795                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
796                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
797                         COND(primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) |
798                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
799       } else {
800          OUT_RING(ring, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
801       }
802    }
803 
804    /* if vertex_flags somehow gets optimized out, your gonna have a bad time: */
805    assert(flags_regid != INVALID_REG);
806 
807    OUT_PKT4(ring, cfg->reg_sp_xs_primitive_cntl, 1);
808    OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
809                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
810 
811    OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl, 1);
812    OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
813                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));
814 
815    OUT_PKT4(ring, cfg->reg_gras_xs_layer_cntl, 1);
816    OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
817                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
818 
819    OUT_REG(ring, A6XX_PC_PS_CNTL(b->fs->reads_primid));
820 
821    OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1);
822    OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(b->fs->total_in) |
823                   COND(b->fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
824                   A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
825                   A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
826 
827    if (b->hs) {
828       OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
829       OUT_RING(ring, b->hs->tess.tcs_vertices_out);
830 
831       fd6_emit_link_map(b->vs, b->hs, ring);
832       fd6_emit_link_map(b->hs, b->ds, ring);
833    }
834 
835    if (b->gs) {
836       uint32_t vertices_out, invocations, vec4_size;
837       uint32_t prev_stage_output_size =
838          b->ds ? b->ds->output_size : b->vs->output_size;
839 
840       if (b->hs) {
841          fd6_emit_link_map(b->ds, b->gs, ring);
842       } else {
843          fd6_emit_link_map(b->vs, b->gs, ring);
844       }
845       vertices_out = b->gs->gs.vertices_out - 1;
846       enum a6xx_tess_output output =
847          primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive);
848       invocations = b->gs->gs.invocations - 1;
849       /* Size of per-primitive alloction in ldlw memory in vec4s. */
850       vec4_size = b->gs->gs.vertices_in *
851                   DIV_ROUND_UP(prev_stage_output_size, 4);
852 
853       OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
854       OUT_RING(ring,
855             A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
856             A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
857             A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
858 
859       OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1);
860       OUT_RING(ring, 0xff);
861 
862       OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
863       OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
864 
865       uint32_t prim_size = prev_stage_output_size;
866       if (prim_size > 64)
867          prim_size = 64;
868       else if (prim_size == 64)
869          prim_size = 63;
870 
871       OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1);
872       OUT_RING(ring, prim_size);
873    }
874 }
875 
876 static enum a6xx_tex_prefetch_cmd
tex_opc_to_prefetch_cmd(opc_t tex_opc)877 tex_opc_to_prefetch_cmd(opc_t tex_opc)
878 {
879    switch (tex_opc) {
880    case OPC_SAM:
881       return TEX_PREFETCH_SAM;
882    default:
883       unreachable("Unknown tex opc for prefeth cmd");
884    }
885 }
886 
/**
 * Emit the registers describing the fragment shader's inputs: the register
 * assignments for input sysvals (frag-coord, front-face, sample-id/mask),
 * the barycentric (ij) coefficient registers, the texture prefetch commands,
 * and the derived GRAS/RB interpolation-enable state.
 */
template <chip CHIP>
static void
emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b)
{
   const struct ir3_shader_variant *fs = b->fs;
   uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
   uint32_t ij_regid[IJ_COUNT];
   uint32_t smask_in_regid;

   /* Note: bitwise-| on bools is intentional/harmless here: */
   bool sample_shading = fs->per_samp | fs->key.sample_shading;
   bool enable_varyings = fs->total_in > 0;

   /* Registers the compiler assigned to each input sysval (an invalid
    * regid if the shader does not read that sysval):
    */
   samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
   smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
   face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
   coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
   /* z/w of gl_FragCoord land in the two registers after x/y: */
   zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
   /* The barycentric sysvals are contiguous, starting at PERSP_PIXEL: */
   for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
      ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);

   if (fs->num_sampler_prefetch > 0) {
      /* It seems like ij_pix is *required* to be r0.x */
      assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
             ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
   }

   /* Prefetch control, followed by one CMD dword per prefetch: */
   OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
   OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
                     COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
                          A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
                     COND(fs->prefetch_end_of_quad,
                          A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
   for (int i = 0; i < fs->num_sampler_prefetch; i++) {
      const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
      OUT_RING(ring, SP_FS_PREFETCH_CMD(
            CHIP, i,
            .src = prefetch->src,
            .samp_id = prefetch->samp_id,
            .tex_id = prefetch->tex_id,
            .dst = prefetch->dst,
            .wrmask = prefetch->wrmask,
            .half = prefetch->half_precision,
            .bindless = prefetch->bindless,
            .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc),
         ).value
      );
   }

   /* Tell HLSQ which registers receive each sysval / barycentric input: */
   OUT_REG(ring,
           HLSQ_CONTROL_1_REG(CHIP,
            b->ctx->screen->info->a6xx.prim_alloc_threshold),
           HLSQ_CONTROL_2_REG(
                 CHIP,
                 .faceregid = face_regid,
                 .sampleid = samp_id_regid,
                 .samplemask = smask_in_regid,
                 .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW],
           ),
           HLSQ_CONTROL_3_REG(
                 CHIP,
                 .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
                 .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
                 .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
                 .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID],
           ),
           HLSQ_CONTROL_4_REG(
                 CHIP,
                 .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
                 .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
                 .xycoordregid = coord_regid,
                 .zwcoordregid = zwcoord_regid,
           ),
           HLSQ_CONTROL_5_REG(
                 CHIP,
                 .linelengthregid = INVALID_REG,
                 .foveationqualityregid = INVALID_REG,
           ),
   );

   enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
   OUT_REG(ring,
           HLSQ_FS_CNTL_0(
                 CHIP,
                 .threadsize = thrsz,
                 .varyings = enable_varyings,
           ),
   );

   /* Reading the face bit or gl_FragCoord forces the linear-pixel ij on;
    * the center-rhw barycentric additionally forces either the per-pixel
    * or per-sample linear ij depending on sample-shading:
    */
   bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
   bool need_size_persamp = false;
   if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
      if (sample_shading)
         need_size_persamp = true;
      else
         need_size = true;
   }

   OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1);
   OUT_RING(ring,
         CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
         CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
         COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
         COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
         COND(fs->fragcoord_compmask != 0,
              A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));

   /* RB mirrors the same enables, plus the sysval enables: */
   OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2);
   OUT_RING(ring,
         CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
         CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
         COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
         COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
         COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
         COND(fs->fragcoord_compmask != 0,
              A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
   OUT_RING(ring,
         A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
            sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
         CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
         CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
         CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
         COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE) |
         COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));

   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1);
   OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));

   /* LRZ needs to know about sample-id use / per-sample frag-coord mode: */
   OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
   OUT_RING(ring,
         CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
         A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
            sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));

   OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
   OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
}
1031 
1032 static void
emit_fs_outputs(struct fd_ringbuffer * ring,const struct program_builder * b)1033 emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b)
1034 {
1035    const struct ir3_shader_variant *fs = b->fs;
1036    uint32_t smask_regid, posz_regid, stencilref_regid;
1037 
1038    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1039    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1040    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1041 
1042    /* we can't write gl_SampleMask for !msaa..  if b0 is zero then we
1043     * end up masking the single sample!!
1044     */
1045    if (!b->key->key.msaa)
1046       smask_regid = regid(63, 0);
1047 
1048    int output_reg_count = 0;
1049    uint32_t fragdata_regid[8];
1050 
1051    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1052       unsigned slot = fs->color0_mrt ? FRAG_RESULT_COLOR : FRAG_RESULT_DATA0 + i;
1053       fragdata_regid[i] = ir3_find_output_regid(fs, slot);
1054       if (VALIDREG(fragdata_regid[i]))
1055          output_reg_count = i + 1;
1056    }
1057 
1058    OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1059    OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1060                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1061                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1062                   COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1063 
1064    OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1065    for (uint32_t i = 0; i < output_reg_count; i++) {
1066       OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1067                      COND(fragdata_regid[i] & HALF_REG_ID,
1068                              A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
1069 
1070       if (VALIDREG(fragdata_regid[i])) {
1071          b->state->mrt_components |= 0xf << (i * 4);
1072       }
1073    }
1074 }
1075 
/**
 * Build a complete program state object (either binning or draw pass,
 * per b->binning_pass): shader instruction state for every active stage,
 * VPC linkage, FS input/output registers, and (when tessellation is
 * active) HS/DS link maps and wave-sizing registers.
 */
template <chip CHIP>
static void
setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b)
   assert_dt
{
   /* Instruction/constant state for each active geometry stage.  The
    * binning pass uses a dummy FS, so skip emitting it there:
    */
   fd6_emit_shader(b->ctx, ring, b->vs);
   fd6_emit_shader(b->ctx, ring, b->hs);
   fd6_emit_shader(b->ctx, ring, b->ds);
   fd6_emit_shader(b->ctx, ring, b->gs);
   if (!b->binning_pass)
      fd6_emit_shader(b->ctx, ring, b->fs);

   OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
   OUT_RING(ring, 0);

   emit_vfd_dest(ring, b->vs);

   emit_vpc(ring, b);

   emit_fs_inputs<CHIP>(ring, b);
   emit_fs_outputs(ring, b);

   if (b->hs) {
      fd6_emit_tess_bos(b->ctx->screen, ring, b->hs);
      fd6_emit_tess_bos(b->ctx->screen, ring, b->ds);
   }

   if (b->hs) {
      uint32_t patch_control_points = b->key->patch_vertices;

      /* VS output size is in units of vec4 dwords (/4 -> 16B units): */
      uint32_t patch_local_mem_size_16b =
         patch_control_points * b->vs->output_size / 4;

      /* Total attribute slots in HS incoming patch. */
      OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
      OUT_RING(ring, patch_local_mem_size_16b);

      const uint32_t wavesize = 64;
      const uint32_t vs_hs_local_mem_size = 16384;

      uint32_t max_patches_per_wave;
      if (b->ctx->screen->info->a6xx.tess_use_shared) {
         /* HS invocations for a patch are always within the same wave,
          * making barriers less expensive. VS can't have barriers so we
          * don't care about VS invocations being in the same wave.
          */
         max_patches_per_wave = wavesize / b->hs->tess.tcs_vertices_out;
      } else {
         /* VS is also in the same wave */
         max_patches_per_wave =
            wavesize / MAX2(patch_control_points,
                            b->hs->tess.tcs_vertices_out);
      }

      /* Fit as many patches as both local memory and the wave limit allow: */
      uint32_t patches_per_wave =
         MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
              max_patches_per_wave);

      /* Wave input size is expressed in 256-byte units: */
      uint32_t wave_input_size = DIV_ROUND_UP(
         patches_per_wave * patch_local_mem_size_16b * 16, 256);

      OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
      OUT_RING(ring, wave_input_size);

      /* Tessellator output topology, derived from the DS declaration: */
      enum a6xx_tess_output output;
      if (b->ds->tess.point_mode)
         output = TESS_POINTS;
      else if (b->ds->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES)
         output = TESS_LINES;
      else if (b->ds->tess.ccw)
         output = TESS_CCW_TRIS;
      else
         output = TESS_CW_TRIS;

      OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
      OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING(
                        fd6_gl2spacing(b->ds->tess.spacing)) |
                        A6XX_PC_TESS_CNTL_OUTPUT(output));
   }
}
1157 
1158 static void emit_interp_state(struct fd_ringbuffer *ring,
1159                               const struct fd6_program_state *state,
1160                               bool rasterflat,
1161                               bool sprite_coord_mode,
1162                               uint32_t sprite_coord_enable);
1163 
1164 static struct fd_ringbuffer *
create_interp_stateobj(struct fd_context * ctx,struct fd6_program_state * state)1165 create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
1166 {
1167    struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4);
1168 
1169    emit_interp_state(ring, state, false, false, 0);
1170 
1171    return ring;
1172 }
1173 
1174 /* build the program streaming state which is not part of the pre-
1175  * baked stateobj because of dependency on other gl state (rasterflat
1176  * or sprite-coord-replacement)
1177  */
1178 struct fd_ringbuffer *
fd6_program_interp_state(struct fd6_emit * emit)1179 fd6_program_interp_state(struct fd6_emit *emit)
1180 {
1181    const struct fd6_program_state *state = fd6_emit_get_prog(emit);
1182 
1183    if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) {
1184       /* fastpath: */
1185       return fd_ringbuffer_ref(state->interp_stateobj);
1186    } else {
1187       struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
1188          emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING);
1189 
1190       emit_interp_state(ring, state, emit->rasterflat,
1191                         emit->sprite_coord_mode, emit->sprite_coord_enable);
1192 
1193       return ring;
1194    }
1195 }
1196 
/**
 * Emit VPC_VARYING_INTERP_MODE and VPC_VARYING_PS_REPL_MODE: per-varying-
 * component interpolation overrides (flat / constant-zero / constant-one)
 * and point-sprite coordinate replacement.  Each component gets a 2-bit
 * field, 16 components per 32-bit register, 8 registers of each kind.
 */
static void
emit_interp_state(struct fd_ringbuffer *ring, const struct fd6_program_state *state,
                  bool rasterflat, bool sprite_coord_mode,
                  uint32_t sprite_coord_enable)
{
   const struct ir3_shader_variant *fs = state->fs;
   uint32_t vinterp[8], vpsrepl[8];

   memset(vinterp, 0, sizeof(vinterp));
   memset(vpsrepl, 0, sizeof(vpsrepl));

   for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count;) {

      /* NOTE: varyings are packed, so if compmask is 0xb
       * then first, third, and fourth component occupy
       * three consecutive varying slots:
       */
      unsigned compmask = fs->inputs[j].compmask;

      uint32_t inloc = fs->inputs[j].inloc;

      bool coord_mode = sprite_coord_mode;
      if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) {
         /* mask is two 2-bit fields, where:
          *   '01' -> S
          *   '10' -> T
          *   '11' -> 1 - T  (flip mode)
          */
         unsigned mask = coord_mode ? 0b1101 : 0b1001;
         uint32_t loc = inloc;
         if (compmask & 0x1) {
            vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x2) {
            vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x4) {
            /* .z <- 0.0f */
            vinterp[loc / 16] |= INTERP_ZERO << ((loc % 16) * 2);
            loc++;
         }
         if (compmask & 0x8) {
            /* .w <- 1.0f */
            vinterp[loc / 16] |= INTERP_ONE << ((loc % 16) * 2);
            loc++;
         }
      } else if (fs->inputs[j].slot == VARYING_SLOT_LAYER ||
                 fs->inputs[j].slot == VARYING_SLOT_VIEWPORT) {
         const struct ir3_shader_variant *last_shader = fd6_last_shader(state);
         uint32_t loc = inloc;

         /* If the last geometry shader doesn't statically write these, they're
          * implicitly zero and the FS is supposed to read zero.
          */
         if (ir3_find_output(last_shader, (gl_varying_slot)fs->inputs[j].slot) < 0 &&
             (compmask & 0x1)) {
            vinterp[loc / 16] |= INTERP_ZERO << ((loc % 16) * 2);
         } else {
            /* layer/viewport are integers, so always flat-shaded: */
            vinterp[loc / 16] |= INTERP_FLAT << ((loc % 16) * 2);
         }
      } else if (fs->inputs[j].flat || (fs->inputs[j].rasterflat && rasterflat)) {
         /* Flat-shade every component this input actually uses: */
         uint32_t loc = inloc;

         for (int i = 0; i < 4; i++) {
            if (compmask & (1 << i)) {
               vinterp[loc / 16] |= INTERP_FLAT << ((loc % 16) * 2);
               loc++;
            }
         }
      }
   }

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */

   OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
   for (int i = 0; i < 8; i++)
      OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */
}
1279 
/**
 * ir3_cache create_state callback: bake a fd6_program_state from the set
 * of shader variants making up a pipeline.  Builds the config, binning,
 * draw, and interp state objects, plus derived metadata (stream-out,
 * viewport count, const-state sizes, driver-param count, LRZ masks).
 *
 * @param data  actually the pipe_context this cache belongs to
 * @param bs    binning-pass VS variant (position/psize-only outputs)
 * @param vs..fs  variants per stage; hs/ds/gs may be NULL
 * Returns the base of the newly CALLOC'd state; freed via
 * fd6_program_destroy().
 */
template <chip CHIP>
static struct ir3_program_state *
fd6_program_create(void *data, const struct ir3_shader_variant *bs,
                   const struct ir3_shader_variant *vs,
                   const struct ir3_shader_variant *hs,
                   const struct ir3_shader_variant *ds,
                   const struct ir3_shader_variant *gs,
                   const struct ir3_shader_variant *fs,
                   const struct ir3_cache_key *key) in_dt
{
   struct fd_context *ctx = fd_context((struct pipe_context *)data);
   struct fd_screen *screen = ctx->screen;
   struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);

   tc_assert_driver_thread(ctx->tc);

   /* if we have streamout, use full VS in binning pass, as the
    * binning pass VS will have outputs on other than position/psize
    * stripped out:
    */
   state->bs = vs->stream_output.num_outputs ? vs : bs;
   state->vs = vs;
   state->hs = hs;
   state->ds = ds;
   state->gs = gs;
   state->fs = fs;
   state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
   state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);

#ifdef DEBUG
   /* Sanity-check that binning and full VS agree on non-sysval input
    * register assignment (required since they share VFD state):
    */
   if (!ds) {
      for (unsigned i = 0; i < bs->inputs_count; i++) {
         if (vs->inputs[i].sysval)
            continue;
         assert(bs->inputs[i].regid == vs->inputs[i].regid);
      }
   }
#endif

   if (hs) {
      /* Allocate the fixed-size tess factor BO globally on the screen.  This
       * lets the program (which ideally we would have shared across contexts,
       * though the current ir3_cache impl doesn't do that) bake in the
       * addresses.
       */
      fd_screen_lock(screen);
      if (!screen->tess_bo)
         screen->tess_bo =
            fd_bo_new(screen->dev, FD6_TESS_BO_SIZE, FD_BO_NOMAP, "tessfactor");
      fd_screen_unlock(screen);
   }

   /* Dummy frag shader used for binning pass: */
   static const struct ir3_shader_variant dummy_fs = {
         .info = {
               .max_reg = -1,
               .max_half_reg = -1,
               .max_const = -1,
         },
   };
   /* The last geometry stage in use: */
   const struct ir3_shader_variant *last_shader = fd6_last_shader(state);

   setup_config_stateobj<CHIP>(ctx, state);

   /* Fields shared between binning and draw pass; vs/fs/last_shader/
    * binning_pass are filled in per pass below:
    */
   struct program_builder b = {
      .state = state,
      .ctx = ctx,
      .key = key,
      .hs  = state->hs,
      .ds  = state->ds,
      .gs  = state->gs,
   };

   /*
    * Setup binning pass program state:
    */

   /* binning VS is wrong when GS is present, so use nonbinning VS
    * TODO: compile both binning VS/GS variants correctly
    *
    * If we have stream-out, we use the full shader for binning
    * pass, rather than the optimized binning pass one, so that we
    * have all the varying outputs available for xfb.  So streamout
    * state should always be derived from the non-binning pass
    * program.
    */
   b.vs  = state->gs || last_shader->stream_output.num_outputs ?
           state->vs : state->bs;
   b.fs  = &dummy_fs;
   b.last_shader  = last_shader->type != MESA_SHADER_VERTEX ?
                    last_shader : state->bs;
   b.binning_pass = true;

   setup_stateobj<CHIP>(state->binning_stateobj, &b);

   /*
    * Setup draw pass program state:
    */
   b.vs = state->vs;
   b.fs = state->fs;
   b.last_shader = last_shader;
   b.binning_pass = false;

   setup_stateobj<CHIP>(state->stateobj, &b);

   state->interp_stateobj = create_interp_stateobj(ctx, state);

   const struct ir3_stream_output_info *stream_output = &last_shader->stream_output;
   if (stream_output->num_outputs > 0)
      state->stream_output = stream_output;

   /* Writing gl_ViewportIndex means all viewports may be used: */
   bool has_viewport =
      VALIDREG(ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT));
   state->num_viewports = has_viewport ? PIPE_MAX_VIEWPORTS : 1;

   /* Note that binning pass uses same const state as draw pass: */
   state->user_consts_cmdstream_size =
         fd6_user_consts_cmdstream_size(state->vs) +
         fd6_user_consts_cmdstream_size(state->hs) +
         fd6_user_consts_cmdstream_size(state->ds) +
         fd6_user_consts_cmdstream_size(state->gs) +
         fd6_user_consts_cmdstream_size(state->fs);

   /* Count stages that need driver params uploaded: */
   unsigned num_dp = 0;
   if (vs->need_driver_params)
      num_dp++;
   if (gs && gs->need_driver_params)
      num_dp++;
   if (hs && hs->need_driver_params)
      num_dp++;
   if (ds && ds->need_driver_params)
      num_dp++;

   state->num_driver_params = num_dp;

   /* dual source blending has an extra fs output in the 2nd slot */
   if (fs->fs.color_is_dual_source) {
      state->mrt_components |= 0xf << 4;
   }

   /* LRZ: start fully enabled, then conservatively mask off based on
    * fragment-shader behavior:
    */
   state->lrz_mask.val = ~0;

   if (fs->has_kill) {
      /* discard means depth can't be written ahead of shading: */
      state->lrz_mask.write = false;
   }

   if (fs->no_earlyz || fs->writes_pos) {
      state->lrz_mask.enable = false;
      state->lrz_mask.write = false;
      state->lrz_mask.test = false;
   }

   if (fs->fs.early_fragment_tests) {
      state->lrz_mask.z_mode = A6XX_EARLY_Z;
   } else if (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref) {
      state->lrz_mask.z_mode = A6XX_LATE_Z;
   } else {
      /* Wildcard indicates that we need to figure out at draw time: */
      state->lrz_mask.z_mode = A6XX_INVALID_ZTEST;
   }

   return &state->base;
}
1444 
1445 static void
fd6_program_destroy(void * data,struct ir3_program_state * state)1446 fd6_program_destroy(void *data, struct ir3_program_state *state)
1447 {
1448    struct fd6_program_state *so = fd6_program_state(state);
1449    fd_ringbuffer_del(so->stateobj);
1450    fd_ringbuffer_del(so->binning_stateobj);
1451    fd_ringbuffer_del(so->config_stateobj);
1452    fd_ringbuffer_del(so->interp_stateobj);
1453    if (so->streamout_stateobj)
1454       fd_ringbuffer_del(so->streamout_stateobj);
1455    free(so);
1456 }
1457 
/* ir3 shader-cache callbacks for creating/destroying baked program state;
 * instantiated per chip generation.
 */
template <chip CHIP>
static const struct ir3_cache_funcs cache_funcs = {
   .create_state = fd6_program_create<CHIP>,
   .destroy_state = fd6_program_destroy,
};
1463 
1464 template <chip CHIP>
1465 void
fd6_prog_init(struct pipe_context * pctx)1466 fd6_prog_init(struct pipe_context *pctx)
1467 {
1468    struct fd_context *ctx = fd_context(pctx);
1469 
1470    ctx->shader_cache = ir3_cache_create(&cache_funcs<CHIP>, ctx);
1471 
1472    ir3_prog_init(pctx);
1473 
1474    fd_prog_init(pctx);
1475 }
1476 
1477 /* Teach the compiler about needed variants: */
1478 template void fd6_prog_init<A6XX>(struct pipe_context *pctx);
1479 template void fd6_prog_init<A7XX>(struct pipe_context *pctx);
1480