1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "ac_nir.h"
26 #include "ac_sqtt.h"
27 #include "si_build_pm4.h"
28 #include "util/u_cpu_detect.h"
29 #include "util/u_index_modify.h"
30 #include "util/u_prim.h"
31 #include "util/u_upload_mgr.h"
32 
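/* This file is compiled once per supported gfx level; GFX_VER is assumed to be
 * supplied by the build system, and GFX() appends the matching chip suffix,
 * e.g. GFX(si_foo) would expand to si_fooGFX9 when GFX_VER == 9 (illustrative
 * name only).
 */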
33 #if (GFX_VER == 6)
34 #define GFX(name) name##GFX6
35 #elif (GFX_VER == 7)
36 #define GFX(name) name##GFX7
37 #elif (GFX_VER == 8)
38 #define GFX(name) name##GFX8
39 #elif (GFX_VER == 9)
40 #define GFX(name) name##GFX9
41 #elif (GFX_VER == 10)
42 #define GFX(name) name##GFX10
43 #elif (GFX_VER == 103)
44 #define GFX(name) name##GFX10_3
45 #elif (GFX_VER == 11)
46 #define GFX(name) name##GFX11
47 #else
48 #error "Unknown gfx level"
49 #endif
50 
51 /* special primitive types */
52 #define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
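/* One past the last gallium primitive type; presumably used for internal
 * rectangle-list draws (e.g. blits/clears). */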
53 
54 template<int NUM_INTERP>
55 static void si_emit_spi_map(struct si_context *sctx)
56 {
57    struct si_shader *ps = sctx->shader.ps.current;
58    struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
59    unsigned spi_ps_input_cntl[NUM_INTERP];
60 
61    STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32);
62 
63    if (!NUM_INTERP)
64       return;
65 
66    struct si_shader *vs = si_get_vs(sctx)->current;
67    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
68 
69    for (unsigned i = 0; i < NUM_INTERP; i++) {
70       union si_input_info input = psinfo->input[i];
71       unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic];
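      /* An OFFSET of 0x20 in SPI_PS_INPUT_CNTL selects the default value
       * instead of a VS output (assumed from the register definition), so only
       * real outputs get the flat-shade/FP16 bits below. */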
72       bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20;
73 
74       if (non_default_val) {
75          if (input.interpolate == INTERP_MODE_FLAT ||
76              (input.interpolate == INTERP_MODE_COLOR && rs->flatshade))
77             ps_input_cntl |= S_028644_FLAT_SHADE(1);
78 
79          if (input.fp16_lo_hi_valid) {
80             ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
81                              S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */
82                              S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2));
83          }
84       }
85 
86       if (input.semantic == VARYING_SLOT_PNTC ||
87           (input.semantic >= VARYING_SLOT_TEX0 && input.semantic <= VARYING_SLOT_TEX7 &&
88            rs->sprite_coord_enable & (1 << (input.semantic - VARYING_SLOT_TEX0)))) {
89          /* Overwrite the whole value (except OFFSET) for sprite coordinates. */
90          ps_input_cntl &= ~C_028644_OFFSET;
91          ps_input_cntl |= S_028644_PT_SPRITE_TEX(1);
92          if (input.fp16_lo_hi_valid & 0x1) {
93             ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
94                              S_028644_ATTR0_VALID(1);
95          }
96       }
97 
98       spi_ps_input_cntl[i] = ps_input_cntl;
99    }
100 
101    /* R_028644_SPI_PS_INPUT_CNTL_0 */
102    /* Dota 2: Only ~16% of SPI map updates set different values. */
103    /* Talos: Only ~9% of SPI map updates set different values. */
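   /* The "opt" variant compares against the tracked register values and only
    * emits the registers that actually changed. */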
104    radeon_begin(&sctx->gfx_cs);
105    radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
106                                sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP);
107    radeon_end_update_context_roll(sctx);
108 }
109 
110 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
111 static bool si_update_shaders(struct si_context *sctx)
112 {
113    struct pipe_context *ctx = (struct pipe_context *)sctx;
114    struct si_shader *old_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current;
115    unsigned old_pa_cl_vs_out_cntl = old_vs ? old_vs->pa_cl_vs_out_cntl : 0;
116    struct si_shader *old_ps = sctx->shader.ps.current;
117    unsigned old_spi_shader_col_format =
118       old_ps ? old_ps->key.ps.part.epilog.spi_shader_col_format : 0;
119    int r;
120 
121    /* Update TCS and TES. */
122    if (HAS_TESS) {
123       if (!sctx->tess_rings) {
124          si_init_tess_factor_ring(sctx);
125          if (!sctx->tess_rings)
126             return false;
127       }
128 
129       if (!sctx->is_user_tcs) {
130          if (!si_set_tcs_to_fixed_func_shader(sctx))
131             return false;
132       }
133 
134       r = si_shader_select(ctx, &sctx->shader.tcs);
135       if (r)
136          return false;
137       si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current);
138 
139       if (!HAS_GS || GFX_VERSION <= GFX8) {
140          r = si_shader_select(ctx, &sctx->shader.tes);
141          if (r)
142             return false;
143 
144          if (HAS_GS) {
145             /* TES as ES */
146             assert(GFX_VERSION <= GFX8);
147             si_pm4_bind_state(sctx, es, sctx->shader.tes.current);
148          } else if (NGG) {
149             si_pm4_bind_state(sctx, gs, sctx->shader.tes.current);
150          } else {
151             si_pm4_bind_state(sctx, vs, sctx->shader.tes.current);
152          }
153       }
154    } else {
155       /* Reset TCS to clear fixed function shader. */
156       if (!sctx->is_user_tcs && sctx->shader.tcs.cso) {
157          sctx->shader.tcs.cso = NULL;
158          sctx->shader.tcs.current = NULL;
159       }
160 
161       if (GFX_VERSION <= GFX8) {
162          si_pm4_bind_state(sctx, ls, NULL);
163          sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
164       }
165       si_pm4_bind_state(sctx, hs, NULL);
166       sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
167    }
168 
169    /* Update GS. */
170    if (HAS_GS) {
171       r = si_shader_select(ctx, &sctx->shader.gs);
172       if (r)
173          return false;
174       si_pm4_bind_state(sctx, gs, sctx->shader.gs.current);
175       if (!NGG) {
176          si_pm4_bind_state(sctx, vs, sctx->shader.gs.current->gs_copy_shader);
177 
178          if (!si_update_gs_ring_buffers(sctx))
179             return false;
180       } else if (GFX_VERSION < GFX11) {
181          si_pm4_bind_state(sctx, vs, NULL);
182          sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
183       }
184    } else {
185       if (!NGG) {
186          si_pm4_bind_state(sctx, gs, NULL);
187          sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
188          if (GFX_VERSION <= GFX8) {
189             si_pm4_bind_state(sctx, es, NULL);
190             sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
191          }
192       }
193    }
194 
195    /* Update VS. */
196    if ((!HAS_TESS && !HAS_GS) || GFX_VERSION <= GFX8) {
197       r = si_shader_select(ctx, &sctx->shader.vs);
198       if (r)
199          return false;
200 
201       if (!HAS_TESS && !HAS_GS) {
202          if (NGG) {
203             si_pm4_bind_state(sctx, gs, sctx->shader.vs.current);
204             if (GFX_VERSION < GFX11) {
205                si_pm4_bind_state(sctx, vs, NULL);
206                sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
207             }
208          } else {
209             si_pm4_bind_state(sctx, vs, sctx->shader.vs.current);
210          }
211       } else if (HAS_TESS) {
212          si_pm4_bind_state(sctx, ls, sctx->shader.vs.current);
213       } else {
214          assert(HAS_GS);
215          si_pm4_bind_state(sctx, es, sctx->shader.vs.current);
216       }
217    }
218 
219    if (GFX_VERSION >= GFX9 && HAS_TESS)
220       sctx->vs_uses_base_instance = sctx->queued.named.hs->uses_base_instance;
221    else if (GFX_VERSION >= GFX9 && HAS_GS)
222       sctx->vs_uses_base_instance = sctx->shader.gs.current->uses_base_instance;
223    else
224       sctx->vs_uses_base_instance = sctx->shader.vs.current->uses_base_instance;
225 
226    union si_vgt_stages_key key;
227    key.index = 0;
228 
229    /* Update VGT_SHADER_STAGES_EN. */
230    if (HAS_TESS) {
231       key.u.tess = 1;
232       if (GFX_VERSION >= GFX10)
233          key.u.hs_wave32 = sctx->queued.named.hs->wave_size == 32;
234    }
235    if (HAS_GS)
236       key.u.gs = 1;
237    if (NGG) {
238       key.index |= si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ctx_reg.ngg.vgt_stages.index;
239    } else if (GFX_VERSION >= GFX10) {
240       if (HAS_GS) {
241          key.u.gs_wave32 = sctx->shader.gs.current->wave_size == 32;
242          key.u.vs_wave32 = sctx->shader.gs.current->gs_copy_shader->wave_size == 32;
243       } else {
244          key.u.vs_wave32 = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->wave_size == 32;
245       }
246    }
247 
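   /* Look up (or lazily build) the cached PM4 state for VGT_SHADER_STAGES_EN
    * keyed by the active stage combination. */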
248    struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index];
249    if (unlikely(!*pm4))
250       *pm4 = si_build_vgt_shader_config(sctx->screen, key);
251    si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
252 
253    if (old_pa_cl_vs_out_cntl !=
254           si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->pa_cl_vs_out_cntl)
255       si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
256 
257    r = si_shader_select(ctx, &sctx->shader.ps);
258    if (r)
259       return false;
260    si_pm4_bind_state(sctx, ps, sctx->shader.ps.current);
261 
262    unsigned db_shader_control = sctx->shader.ps.current->ctx_reg.ps.db_shader_control;
263    if (sctx->ps_db_shader_control != db_shader_control) {
264       sctx->ps_db_shader_control = db_shader_control;
265       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
266       if (sctx->screen->dpbb_allowed)
267          si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
268    }
269 
270    if (si_pm4_state_changed(sctx, ps) ||
271        (!NGG && si_pm4_state_changed(sctx, vs)) ||
272        (NGG && si_pm4_state_changed(sctx, gs))) {
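      /* emit_spi_map[] is assumed to hold the instantiations of the templated
       * si_emit_spi_map above, one per possible interpolant count. */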
273       sctx->atoms.s.spi_map.emit = sctx->emit_spi_map[sctx->shader.ps.current->ctx_reg.ps.num_interp];
274       si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
275    }
276 
277    if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) &&
278        si_pm4_state_changed(sctx, ps) &&
279        (!old_ps || old_spi_shader_col_format !=
280                       sctx->shader.ps.current->key.ps.part.epilog.spi_shader_col_format))
281       si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
282 
283    if (sctx->smoothing_enabled !=
284        sctx->shader.ps.current->key.ps.mono.poly_line_smoothing) {
285       sctx->smoothing_enabled = sctx->shader.ps.current->key.ps.mono.poly_line_smoothing;
286       si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
287 
288       /* NGG cull state uses smoothing_enabled. */
289       if (GFX_VERSION >= GFX10 && sctx->screen->use_ngg_culling)
290          si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state);
291 
292       if (GFX_VERSION == GFX6 ||
293           (GFX_VERSION == GFX11 && sctx->screen->info.has_export_conflict_bug))
294          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
295 
296       if (sctx->framebuffer.nr_samples <= 1)
297          si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
298    }
299 
300    if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
301       /* Pretend the bound shaders form a vk pipeline */
302       uint32_t pipeline_code_hash = 0;
303       uint64_t base_address = ~0;
304 
305       for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
306          struct si_shader *shader = sctx->shaders[i].current;
307          if (sctx->shaders[i].cso && shader) {
308             pipeline_code_hash = _mesa_hash_data_with_seed(
309                shader->binary.elf_buffer,
310                shader->binary.elf_size,
311                pipeline_code_hash);
312             base_address = MIN2(base_address,
313                                 shader->bo->gpu_address);
314          }
315       }
316 
317       struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
318       if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
319          si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
320       }
321 
322       si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
323    }
324 
325    if ((GFX_VERSION <= GFX8 &&
326         (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, es))) ||
327        si_pm4_state_enabled_and_changed(sctx, hs) || si_pm4_state_enabled_and_changed(sctx, gs) ||
328        (!NGG && si_pm4_state_enabled_and_changed(sctx, vs)) || si_pm4_state_enabled_and_changed(sctx, ps)) {
329       unsigned scratch_size = 0;
330 
331       if (HAS_TESS) {
332          if (GFX_VERSION <= GFX8) /* LS */
333             scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave);
334 
335          scratch_size = MAX2(scratch_size, sctx->queued.named.hs->config.scratch_bytes_per_wave);
336 
337          if (HAS_GS) {
338             if (GFX_VERSION <= GFX8) /* ES */
339                scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave);
340 
341             scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave);
342          } else {
343             scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave);
344          }
345       } else if (HAS_GS) {
346          if (GFX_VERSION <= GFX8) /* ES */
347             scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave);
348 
349          scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave);
350       } else {
351          scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave);
352       }
353 
354       scratch_size = MAX2(scratch_size, sctx->shader.ps.current->config.scratch_bytes_per_wave);
355 
356       if (scratch_size && !si_update_spi_tmpring_size(sctx, scratch_size))
357          return false;
358 
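      /* Queue the newly bound shaders for L2 prefetch; GFX6 has no CP DMA
       * L2 prefetch (see si_prefetch_shaders). */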
359       if (GFX_VERSION >= GFX7) {
360          if (GFX_VERSION <= GFX8 && HAS_TESS && si_pm4_state_enabled_and_changed(sctx, ls))
361             sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
362 
363          if (HAS_TESS && si_pm4_state_enabled_and_changed(sctx, hs))
364             sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
365 
366          if (GFX_VERSION <= GFX8 && HAS_GS && si_pm4_state_enabled_and_changed(sctx, es))
367             sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
368 
369          if ((HAS_GS || NGG) && si_pm4_state_enabled_and_changed(sctx, gs))
370             sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
371 
372          if (!NGG && si_pm4_state_enabled_and_changed(sctx, vs))
373             sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
374 
375          if (si_pm4_state_enabled_and_changed(sctx, ps))
376             sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
377       }
378    }
379 
380    sctx->do_update_shaders = false;
381    return true;
382 }
383 
384 ALWAYS_INLINE
385 static unsigned si_conv_pipe_prim(unsigned mode)
386 {
387    static const unsigned prim_conv[] = {
388       [PIPE_PRIM_POINTS] = V_008958_DI_PT_POINTLIST,
389       [PIPE_PRIM_LINES] = V_008958_DI_PT_LINELIST,
390       [PIPE_PRIM_LINE_LOOP] = V_008958_DI_PT_LINELOOP,
391       [PIPE_PRIM_LINE_STRIP] = V_008958_DI_PT_LINESTRIP,
392       [PIPE_PRIM_TRIANGLES] = V_008958_DI_PT_TRILIST,
393       [PIPE_PRIM_TRIANGLE_STRIP] = V_008958_DI_PT_TRISTRIP,
394       [PIPE_PRIM_TRIANGLE_FAN] = V_008958_DI_PT_TRIFAN,
395       [PIPE_PRIM_QUADS] = V_008958_DI_PT_QUADLIST,
396       [PIPE_PRIM_QUAD_STRIP] = V_008958_DI_PT_QUADSTRIP,
397       [PIPE_PRIM_POLYGON] = V_008958_DI_PT_POLYGON,
398       [PIPE_PRIM_LINES_ADJACENCY] = V_008958_DI_PT_LINELIST_ADJ,
399       [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_008958_DI_PT_LINESTRIP_ADJ,
400       [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_008958_DI_PT_TRILIST_ADJ,
401       [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_008958_DI_PT_TRISTRIP_ADJ,
402       [PIPE_PRIM_PATCHES] = V_008958_DI_PT_PATCH,
403       [SI_PRIM_RECTANGLE_LIST] = V_008958_DI_PT_RECTLIST};
404    assert(mode < ARRAY_SIZE(prim_conv));
405    return prim_conv[mode];
406 }
407 
408 template<amd_gfx_level GFX_VERSION>
409 static void si_cp_dma_prefetch_inline(struct si_context *sctx, struct pipe_resource *buf,
410                                       unsigned offset, unsigned size)
411 {
412    uint64_t address = si_resource(buf)->gpu_address + offset;
413 
414    assert(GFX_VERSION >= GFX7);
415 
416    if (GFX_VERSION >= GFX11)
417       size = MIN2(size, 32768 - SI_CPDMA_ALIGNMENT);
418 
419    /* The prefetch address and size must be aligned, so that we don't have to apply
420     * the complicated hw bug workaround.
421     *
422     * The size should also be less than 2 MB, so that we don't have to use a loop.
423     * Callers shouldn't need to prefetch more than 2 MB.
424     */
425    assert(size % SI_CPDMA_ALIGNMENT == 0);
426    assert(address % SI_CPDMA_ALIGNMENT == 0);
427    assert(size < S_415_BYTE_COUNT_GFX6(~0u));
428 
429    uint32_t header = S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);
430    uint32_t command = S_415_BYTE_COUNT_GFX6(size);
431 
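   /* On GFX9+ the destination is "nowhere", so the packet only pulls the source
    * into L2; older chips copy the range onto itself through L2 for the same
    * effect. */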
432    if (GFX_VERSION >= GFX9) {
433       command |= S_415_DISABLE_WR_CONFIRM_GFX9(1);
434       header |= S_411_DST_SEL(V_411_NOWHERE);
435    } else {
436       command |= S_415_DISABLE_WR_CONFIRM_GFX6(1);
437       header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
438    }
439 
440    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
441    radeon_begin(cs);
442    radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
443    radeon_emit(header);
444    radeon_emit(address);       /* SRC_ADDR_LO [31:0] */
445    radeon_emit(address >> 32); /* SRC_ADDR_HI [31:0] */
446    radeon_emit(address);       /* DST_ADDR_LO [31:0] */
447    radeon_emit(address >> 32); /* DST_ADDR_HI [31:0] */
448    radeon_emit(command);
449    radeon_end();
450 }
451 
452 #if GFX_VER == 6 /* declare this function only once because it handles all chips. */
453 
454 void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
455                         unsigned offset, unsigned size)
456 {
457    switch (sctx->gfx_level) {
458    case GFX7:
459       si_cp_dma_prefetch_inline<GFX7>(sctx, buf, offset, size);
460       break;
461    case GFX8:
462       si_cp_dma_prefetch_inline<GFX8>(sctx, buf, offset, size);
463       break;
464    case GFX9:
465       si_cp_dma_prefetch_inline<GFX9>(sctx, buf, offset, size);
466       break;
467    case GFX10:
468       si_cp_dma_prefetch_inline<GFX10>(sctx, buf, offset, size);
469       break;
470    case GFX10_3:
471       si_cp_dma_prefetch_inline<GFX10_3>(sctx, buf, offset, size);
472       break;
473    case GFX11:
474       si_cp_dma_prefetch_inline<GFX11>(sctx, buf, offset, size);
475       break;
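   /* GFX6 has no CP DMA L2 prefetch, so it falls through to the default no-op. */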
476    default:
477       break;
478    }
479 }
480 
481 #endif
482 
483 template<amd_gfx_level GFX_VERSION>
484 static void si_prefetch_shader_async(struct si_context *sctx, struct si_shader *shader)
485 {
486    struct pipe_resource *bo = &shader->bo->b.b;
487 
488    si_cp_dma_prefetch_inline<GFX_VERSION>(sctx, bo, 0, bo->width0);
489 }
490 
491 enum si_L2_prefetch_mode {
492    PREFETCH_BEFORE_DRAW = 1,
493    PREFETCH_AFTER_DRAW,
494    PREFETCH_ALL,
495 };
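/* BEFORE_DRAW prefetches only the first enabled stage and returns; AFTER_DRAW
 * skips that first stage and prefetches the rest; ALL does both at once. */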
496 
497 /**
498  * Prefetch shaders.
499  */
500 template<amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
501          si_L2_prefetch_mode mode>
502 static void si_prefetch_shaders(struct si_context *sctx)
503 {
504    unsigned mask = sctx->prefetch_L2_mask;
505 
506    /* GFX6 doesn't support the L2 prefetch. */
507    if (GFX_VERSION < GFX7 || !mask)
508       return;
509 
510    /* Prefetch shaders and VBO descriptors to TC L2. */
511    if (GFX_VERSION >= GFX11) {
512       if (HAS_TESS) {
513          if (mode != PREFETCH_AFTER_DRAW) {
514             if (mask & SI_PREFETCH_HS)
515                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.hs);
516 
517             if (mode == PREFETCH_BEFORE_DRAW)
518                return;
519          }
520 
521          if (mask & SI_PREFETCH_GS)
522             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.gs);
523       } else if (mode != PREFETCH_AFTER_DRAW) {
524          if (mask & SI_PREFETCH_GS)
525             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.gs);
526 
527          if (mode == PREFETCH_BEFORE_DRAW)
528             return;
529       }
530    } else if (GFX_VERSION >= GFX9) {
531       if (HAS_TESS) {
532          if (mode != PREFETCH_AFTER_DRAW) {
533             if (mask & SI_PREFETCH_HS)
534                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.hs);
535 
536             if (mode == PREFETCH_BEFORE_DRAW)
537                return;
538          }
539 
540          if ((HAS_GS || NGG) && mask & SI_PREFETCH_GS)
541             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.gs);
542          if (!NGG && mask & SI_PREFETCH_VS)
543             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.vs);
544       } else if (HAS_GS || NGG) {
545          if (mode != PREFETCH_AFTER_DRAW) {
546             if (mask & SI_PREFETCH_GS)
547                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.gs);
548 
549             if (mode == PREFETCH_BEFORE_DRAW)
550                return;
551          }
552 
553          if (!NGG && mask & SI_PREFETCH_VS)
554             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.vs);
555       } else {
556          if (mode != PREFETCH_AFTER_DRAW) {
557             if (mask & SI_PREFETCH_VS)
558                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.vs);
559 
560             if (mode == PREFETCH_BEFORE_DRAW)
561                return;
562          }
563       }
564    } else {
565       /* GFX6-GFX8 */
566       /* Choose the right spot for the VBO prefetch. */
567       if (HAS_TESS) {
568          if (mode != PREFETCH_AFTER_DRAW) {
569             if (mask & SI_PREFETCH_LS)
570                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.ls);
571 
572             if (mode == PREFETCH_BEFORE_DRAW)
573                return;
574          }
575 
576          if (mask & SI_PREFETCH_HS)
577             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.hs);
578          if (mask & SI_PREFETCH_ES)
579             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.es);
580          if (mask & SI_PREFETCH_GS)
581             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.gs);
582          if (mask & SI_PREFETCH_VS)
583             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.vs);
584       } else if (HAS_GS) {
585          if (mode != PREFETCH_AFTER_DRAW) {
586             if (mask & SI_PREFETCH_ES)
587                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.es);
588 
589             if (mode == PREFETCH_BEFORE_DRAW)
590                return;
591          }
592 
593          if (mask & SI_PREFETCH_GS)
594             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.gs);
595          if (mask & SI_PREFETCH_VS)
596             si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.vs);
597       } else {
598          if (mode != PREFETCH_AFTER_DRAW) {
599             if (mask & SI_PREFETCH_VS)
600                si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.vs);
601 
602             if (mode == PREFETCH_BEFORE_DRAW)
603                return;
604          }
605       }
606    }
607 
608    if (mask & SI_PREFETCH_PS)
609       si_prefetch_shader_async<GFX_VERSION>(sctx, sctx->queued.named.ps);
610 
611    /* This must be cleared only when AFTER_DRAW is true. */
612    sctx->prefetch_L2_mask = 0;
613 }
614 
615 /**
616  * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
617  * LS.LDS_SIZE is shared by all 3 shader stages.
618  *
619  * The information about LDS and other non-compile-time parameters is then
620  * written to userdata SGPRs.
621  */
622 static void si_emit_derived_tess_state(struct si_context *sctx, unsigned *num_patches)
623 {
624    struct si_shader *ls_current;
625    struct si_shader_selector *ls;
626    struct si_shader_selector *tcs = sctx->shader.tcs.cso;
627    unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
628    bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
629    unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
630    uint8_t num_tcs_input_cp = sctx->patch_vertices;
631 
632    /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
633    if (sctx->gfx_level >= GFX9) {
634       ls_current = sctx->shader.tcs.current;
635       ls = ls_current->key.ge.part.tcs.ls;
636    } else {
637       ls_current = sctx->shader.vs.current;
638       ls = sctx->shader.vs.cso;
639    }
640 
641    if (sctx->last_ls == ls_current && sctx->last_tcs == tcs &&
642        sctx->last_tes_sh_base == tes_sh_base && sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
643        (!has_primid_instancing_bug || (sctx->last_tess_uses_primid == tess_uses_primid))) {
644       *num_patches = sctx->last_num_patches;
645       return;
646    }
647 
648    sctx->last_ls = ls_current;
649    sctx->last_tcs = tcs;
650    sctx->last_tes_sh_base = tes_sh_base;
651    sctx->last_num_tcs_input_cp = num_tcs_input_cp;
652    sctx->last_tess_uses_primid = tess_uses_primid;
653 
654    /* This calculates how shader inputs and outputs among VS, TCS, and TES
655     * are laid out in LDS. */
656    unsigned num_tcs_outputs = util_last_bit64(tcs->info.outputs_written);
657    unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
658    unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);
659 
660    unsigned input_vertex_size = ls->info.lshs_vertex_stride;
661    unsigned output_vertex_size = num_tcs_outputs * 16;
662    unsigned input_patch_size;
663 
664    /* Allocate LDS for TCS inputs only if it's used. */
665    if (!ls_current->key.ge.opt.same_patch_vertices ||
666        tcs->info.base.inputs_read & ~tcs->info.tcs_vgpr_only_inputs)
667       input_patch_size = num_tcs_input_cp * input_vertex_size;
668    else
669       input_patch_size = 0;
670 
671    unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
672    unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
673    unsigned lds_per_patch;
674 
675    /* Compute the LDS size per patch.
676     *
677     * LDS is used to store TCS outputs if they are read, and to store tess
678     * factors if they are not defined in all invocations.
679     */
680    if (tcs->info.base.outputs_read ||
681        tcs->info.base.patch_outputs_read ||
682        !tcs->info.tessfactors_are_def_in_all_invocs) {
683       lds_per_patch = input_patch_size + output_patch_size;
684    } else {
685       /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
686       lds_per_patch = MAX2(input_patch_size, output_patch_size);
687    }
688 
689    /* Ensure that we only need 4 waves per CU, so that we don't need to check
690     * resource usage (such as whether we have enough VGPRs to fit the whole
691     * threadgroup into the CU). It also ensures that the number of tcs in and out
692     * vertices per threadgroup are at most 256, which is the hw limit.
693     */
694    unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
695    *num_patches = 256 / max_verts_per_patch;
696 
697    /* Not necessary for correctness, but higher numbers are slower.
698     * The hardware can do more, but the radeonsi shader constant is
699     * limited to 6 bits.
700     */
701    *num_patches = MIN2(*num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */
702 
703    /* When distributed tessellation is unsupported, switch between SEs
704     * at a higher frequency to manually balance the workload between SEs.
705     */
706    if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
707       *num_patches = MIN2(*num_patches, 16); /* recommended */
708 
709    /* Make sure the output data fits in the offchip buffer */
710    *num_patches =
711       MIN2(*num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size);
712 
713    /* Make sure that the data fits in LDS. This assumes the shaders only
714     * use LDS for the inputs and outputs.
715     *
716     * The maximum allowed LDS size is 32K. Higher numbers can hang.
717     * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.
718     */
719    ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */
720    unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
721    *num_patches = MIN2(*num_patches, target_lds_size / lds_per_patch);
722    *num_patches = MAX2(*num_patches, 1);
723    assert(*num_patches * lds_per_patch <= max_lds_size);
724 
725    /* Make sure that vector lanes are fully occupied by cutting off the last wave
726     * if it's only partially filled.
727     */
728    unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
729    unsigned wave_size = ls_current->wave_size;
730 
731    if (temp_verts_per_tg > wave_size &&
732        (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
733       *num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
734 
735    if (sctx->gfx_level == GFX6) {
736       /* GFX6 bug workaround, related to power management. Limit LS-HS
737        * threadgroups to only one wave.
738        */
739       unsigned one_wave = wave_size / max_verts_per_patch;
740       *num_patches = MIN2(*num_patches, one_wave);
741    }
742 
743    /* The VGT HS block increments the patch ID unconditionally
744     * within a single threadgroup. This results in incorrect
745     * patch IDs when instanced draws are used.
746     *
747     * The intended solution is to restrict threadgroups to
748     * a single instance by setting SWITCH_ON_EOI, which
749     * should cause IA to split instances up. However, this
750     * doesn't work correctly on GFX6 when there is no other
751     * SE to switch to.
752     */
753    if (has_primid_instancing_bug && tess_uses_primid)
754       *num_patches = 1;
755 
756    sctx->last_num_patches = *num_patches;
757 
758    unsigned output_patch0_offset = input_patch_size * *num_patches;
759    unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
760 
761    /* Compute userdata SGPRs. */
762    assert(((input_vertex_size / 4) & ~0xff) == 0);
763    assert(((output_vertex_size / 4) & ~0xff) == 0);
764    assert(((input_patch_size / 4) & ~0x1fff) == 0);
765    assert(((output_patch_size / 4) & ~0x1fff) == 0);
766    assert(((output_patch0_offset / 4) & ~0xffff) == 0);
767    assert(((perpatch_output_offset / 4) & ~0xffff) == 0);
768    assert(num_tcs_input_cp <= 32);
769    assert(num_tcs_output_cp <= 32);
770    assert(*num_patches <= 64);
771    assert(((pervertex_output_patch_size * *num_patches) & ~0x1fffff) == 0);
772 
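   /* Use the TMZ tess ring when the command stream is secure (protected content). */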
773    uint64_t ring_va = (unlikely(sctx->ws->cs_is_secure(&sctx->gfx_cs)) ?
774       si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
775    assert((ring_va & u_bit_consecutive(0, 19)) == 0);
776 
777    unsigned tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
778    unsigned tcs_out_offsets = (output_patch0_offset / 4) | ((perpatch_output_offset / 4) << 16);
779    unsigned offchip_layout =
780       (*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |
781       ((pervertex_output_patch_size * *num_patches) << 11);
782 
783    /* Compute the LDS size. */
784    unsigned lds_size = lds_per_patch * *num_patches;
785 
786    if (sctx->gfx_level >= GFX7) {
787       assert(lds_size <= 65536);
788       lds_size = align(lds_size, 512) / 512;
789    } else {
790       assert(lds_size <= 32768);
791       lds_size = align(lds_size, 256) / 256;
792    }
793 
794    /* Set SI_SGPR_VS_STATE_BITS. */
795    SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_PATCH_SIZE, input_patch_size / 4);
796    SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4);
797 
798    /* We should be able to support in-shader LDS use with LLVM >= 9
799     * by just adding the lds_sizes together, but it has never
800     * been tested. */
801    assert(ls_current->config.lds_size == 0);
802 
803    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
804    radeon_begin(cs);
805 
806    if (sctx->gfx_level >= GFX9) {
807       unsigned hs_rsrc2 = ls_current->config.rsrc2;
808 
809       if (sctx->gfx_level >= GFX10)
810          hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(lds_size);
811       else
812          hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size);
813 
814       radeon_set_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2);
815 
816       /* Set userdata SGPRs for merged LS-HS. */
817       radeon_set_sh_reg_seq(
818          R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
819       radeon_emit(offchip_layout);
820       radeon_emit(tcs_out_offsets);
821       radeon_emit(tcs_out_layout);
822    } else {
823       unsigned ls_rsrc2 = ls_current->config.rsrc2;
824 
825       si_multiwave_lds_size_workaround(sctx->screen, &lds_size);
826       ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size);
827 
828       /* Due to a hw bug, RSRC2_LS must be written twice with another
829        * LS register written in between. */
830       if (sctx->gfx_level == GFX7 && sctx->family != CHIP_HAWAII)
831          radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
832       radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
833       radeon_emit(ls_current->config.rsrc1);
834       radeon_emit(ls_rsrc2);
835 
836       /* Set userdata SGPRs for TCS. */
837       radeon_set_sh_reg_seq(
838          R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
839       radeon_emit(offchip_layout);
840       radeon_emit(tcs_out_offsets);
841       radeon_emit(tcs_out_layout);
842       radeon_emit(sctx->current_vs_state);
843    }
844 
845    /* Set userdata SGPRs for TES. */
846    radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
847    radeon_emit(offchip_layout);
848    radeon_emit(ring_va);
849    radeon_end();
850 
851    unsigned ls_hs_config =
852          S_028B58_NUM_PATCHES(*num_patches) |
853          S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
854          S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
855 
856    if (sctx->last_ls_hs_config != ls_hs_config) {
857       radeon_begin(cs);
858       if (sctx->gfx_level >= GFX7) {
859          radeon_set_context_reg_idx(R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
860       } else {
861          radeon_set_context_reg(R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
862       }
863       radeon_end_update_context_roll(sctx);
864       sctx->last_ls_hs_config = ls_hs_config;
865    }
866 }
867 
868 static unsigned si_num_prims_for_vertices(enum pipe_prim_type prim,
869                                           unsigned count, unsigned vertices_per_patch)
870 {
871    switch (prim) {
872    case PIPE_PRIM_PATCHES:
873       return count / vertices_per_patch;
874    case PIPE_PRIM_POLYGON:
875       /* It's a triangle fan with different edge flags. */
876       return count >= 3 ? count - 2 : 0;
877    case SI_PRIM_RECTANGLE_LIST:
878       return count / 3;
879    default:
880       return u_decomposed_prims_for_vertices(prim, count);
881    }
882 }
883 
884 static unsigned si_get_init_multi_vgt_param(struct si_screen *sscreen, union si_vgt_param_key *key)
885 {
886    STATIC_ASSERT(sizeof(union si_vgt_param_key) == 2);
887    unsigned max_primgroup_in_wave = 2;
888 
889    /* SWITCH_ON_EOP(0) is always preferable. */
890    bool wd_switch_on_eop = false;
891    bool ia_switch_on_eop = false;
892    bool ia_switch_on_eoi = false;
893    bool partial_vs_wave = false;
894    bool partial_es_wave = false;
895 
896    if (key->u.uses_tess) {
897       /* SWITCH_ON_EOI must be set if PrimID is used. */
898       if (key->u.tess_uses_prim_id)
899          ia_switch_on_eoi = true;
900 
901       /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
902       if ((sscreen->info.family == CHIP_TAHITI || sscreen->info.family == CHIP_PITCAIRN ||
903            sscreen->info.family == CHIP_BONAIRE) &&
904           key->u.uses_gs)
905          partial_vs_wave = true;
906 
907       /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= GFX8) */
908       if (sscreen->info.has_distributed_tess) {
909          if (key->u.uses_gs) {
910             if (sscreen->info.gfx_level == GFX8)
911                partial_es_wave = true;
912          } else {
913             partial_vs_wave = true;
914          }
915       }
916    }
917 
918    /* This is a hardware requirement. */
919    if (key->u.line_stipple_enabled || (sscreen->debug_flags & DBG(SWITCH_ON_EOP))) {
920       ia_switch_on_eop = true;
921       wd_switch_on_eop = true;
922    }
923 
924    if (sscreen->info.gfx_level >= GFX7) {
925       /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
926        * 4 shader engines. Set 1 to pass the assertion below.
927        * The other cases are hardware requirements.
928        *
929        * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0
930        * for points, line strips, and tri strips.
931        */
932       if (sscreen->info.max_se <= 2 || key->u.prim == PIPE_PRIM_POLYGON ||
933           key->u.prim == PIPE_PRIM_LINE_LOOP || key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||
934           key->u.prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
935           (key->u.primitive_restart &&
936            (sscreen->info.family < CHIP_POLARIS10 ||
937             (key->u.prim != PIPE_PRIM_POINTS && key->u.prim != PIPE_PRIM_LINE_STRIP &&
938              key->u.prim != PIPE_PRIM_TRIANGLE_STRIP))) ||
939           key->u.count_from_stream_output)
940          wd_switch_on_eop = true;
941 
942       /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
943        * We don't know that for indirect drawing, so treat it as
944        * always problematic. */
945       if (sscreen->info.family == CHIP_HAWAII && key->u.uses_instancing)
946          wd_switch_on_eop = true;
947 
948       /* Performance recommendation for 4 SE Gfx7-8 parts if
949        * instances are smaller than a primgroup.
950        * Assume indirect draws always use small instances.
951        * This is needed for good VS wave utilization.
952        */
953       if (sscreen->info.gfx_level <= GFX8 && sscreen->info.max_se == 4 &&
954           key->u.multi_instances_smaller_than_primgroup)
955          wd_switch_on_eop = true;
956 
957       /* Required on GFX7 and later. */
958       if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
959          ia_switch_on_eoi = true;
960 
961       /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
962        * to work around a GS hang.
963        */
964       if (key->u.uses_gs &&
965           (sscreen->info.family == CHIP_TONGA || sscreen->info.family == CHIP_FIJI ||
966            sscreen->info.family == CHIP_POLARIS10 || sscreen->info.family == CHIP_POLARIS11 ||
967            sscreen->info.family == CHIP_POLARIS12 || sscreen->info.family == CHIP_VEGAM))
968          partial_vs_wave = true;
969 
970       /* Required by Hawaii and, for some special cases, by GFX8. */
971       if (ia_switch_on_eoi &&
972           (sscreen->info.family == CHIP_HAWAII ||
973            (sscreen->info.gfx_level == GFX8 && (key->u.uses_gs || max_primgroup_in_wave != 2))))
974          partial_vs_wave = true;
975 
976       /* Instancing bug on Bonaire. */
977       if (sscreen->info.family == CHIP_BONAIRE && ia_switch_on_eoi && key->u.uses_instancing)
978          partial_vs_wave = true;
979 
980       /* This only applies to Polaris10 and later 4 SE chips.
981        * wd_switch_on_eop is already true on all other chips.
982        */
983       if (!wd_switch_on_eop && key->u.primitive_restart)
984          partial_vs_wave = true;
985 
986       /* If the WD switch is false, the IA switch must be false too. */
987       assert(wd_switch_on_eop || !ia_switch_on_eop);
988    }
989 
990    /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
991    if (sscreen->info.gfx_level <= GFX8 && ia_switch_on_eoi)
992       partial_es_wave = true;
993 
994    return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
995           S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
996           S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
997           S_028AA8_WD_SWITCH_ON_EOP(sscreen->info.gfx_level >= GFX7 ? wd_switch_on_eop : 0) |
998           /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
999           S_028AA8_MAX_PRIMGRP_IN_WAVE(sscreen->info.gfx_level == GFX8 ? max_primgroup_in_wave
1000                                                                         : 0) |
1001           S_030960_EN_INST_OPT_BASIC(sscreen->info.gfx_level >= GFX9) |
1002           S_030960_EN_INST_OPT_ADV(sscreen->info.gfx_level >= GFX9);
1003 }
1004 
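/* Precompute IA_MULTI_VGT_PARAM for every combination of draw parameters so
 * draw calls only need a table lookup. */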
1005 static void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
1006 {
1007    for (int prim = 0; prim <= SI_PRIM_RECTANGLE_LIST; prim++)
1008       for (int uses_instancing = 0; uses_instancing < 2; uses_instancing++)
1009          for (int multi_instances = 0; multi_instances < 2; multi_instances++)
1010             for (int primitive_restart = 0; primitive_restart < 2; primitive_restart++)
1011                for (int count_from_so = 0; count_from_so < 2; count_from_so++)
1012                   for (int line_stipple = 0; line_stipple < 2; line_stipple++)
1013                      for (int uses_tess = 0; uses_tess < 2; uses_tess++)
1014                         for (int tess_uses_primid = 0; tess_uses_primid < 2; tess_uses_primid++)
1015                            for (int uses_gs = 0; uses_gs < 2; uses_gs++) {
1016                               union si_vgt_param_key key;
1017 
1018                               key.index = 0;
1019                               key.u.prim = prim;
1020                               key.u.uses_instancing = uses_instancing;
1021                               key.u.multi_instances_smaller_than_primgroup = multi_instances;
1022                               key.u.primitive_restart = primitive_restart;
1023                               key.u.count_from_stream_output = count_from_so;
1024                               key.u.line_stipple_enabled = line_stipple;
1025                               key.u.uses_tess = uses_tess;
1026                               key.u.tess_uses_prim_id = tess_uses_primid;
1027                               key.u.uses_gs = uses_gs;
1028 
1029                               sctx->ia_multi_vgt_param[key.index] =
1030                                  si_get_init_multi_vgt_param(sctx->screen, &key);
1031                            }
1032 }
1033 
1034 static bool si_is_line_stipple_enabled(struct si_context *sctx)
1035 {
1036    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
1037 
1038    return rs->line_stipple_enable && sctx->current_rast_prim != PIPE_PRIM_POINTS &&
1039           (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim));
1040 }
1041 
1042 enum si_is_draw_vertex_state {
1043    DRAW_VERTEX_STATE_OFF,
1044    DRAW_VERTEX_STATE_ON,
1045 };
1046 
1047 template <si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
1048 static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect,
1049                                           enum pipe_prim_type prim,
1050                                           unsigned min_vertex_count,
1051                                           unsigned instance_count,
1052                                           unsigned num_prims,
1053                                           ubyte vertices_per_patch)
1054 {
1055    if (IS_DRAW_VERTEX_STATE)
1056       return 0;
1057 
1058    if (indirect) {
1059       return indirect->buffer ||
1060              (instance_count > 1 && indirect->count_from_stream_output);
1061    } else {
1062       return instance_count > 1 &&
1063              si_num_prims_for_vertices(prim, min_vertex_count, vertices_per_patch) < num_prims;
1064    }
1065 }
1066 
1067 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
1068           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
1069 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
1070                                           const struct pipe_draw_indirect_info *indirect,
1071                                           enum pipe_prim_type prim, unsigned num_patches,
1072                                           unsigned instance_count, bool primitive_restart,
1073                                           unsigned min_vertex_count)
1074 {
1075    union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
1076    unsigned primgroup_size;
1077    unsigned ia_multi_vgt_param;
1078 
1079    if (HAS_TESS) {
1080       primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
1081    } else if (HAS_GS) {
1082       primgroup_size = 64; /* recommended with a GS */
1083    } else {
1084       primgroup_size = 128; /* recommended without a GS and tess */
1085    }
1086 
1087    key.u.prim = prim;
1088    key.u.uses_instancing = !IS_DRAW_VERTEX_STATE &&
1089                            ((indirect && indirect->buffer) || instance_count > 1);
1090    key.u.multi_instances_smaller_than_primgroup =
1091       num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
1092                                                           instance_count, primgroup_size,
1093                                                           sctx->patch_vertices);
1094    key.u.primitive_restart = !IS_DRAW_VERTEX_STATE && primitive_restart;
1095    key.u.count_from_stream_output = !IS_DRAW_VERTEX_STATE && indirect &&
1096                                     indirect->count_from_stream_output;
1097    key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);
1098 
1099    ia_multi_vgt_param =
1100       sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
1101 
1102    if (HAS_GS) {
1103       /* GS requirement. */
1104       if (GFX_VERSION <= GFX8 &&
1105           SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
1106          ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
1107 
1108       /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
1109        * The hw doc says all multi-SE chips are affected, but Vulkan
1110        * only applies it to Hawaii. Do what Vulkan does.
1111        */
1112       if (GFX_VERSION == GFX7 &&
1113           sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
1114           num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
1115                                                               instance_count, 2, sctx->patch_vertices))
1116          sctx->flags |= SI_CONTEXT_VGT_FLUSH;
1117    }
1118 
1119    return ia_multi_vgt_param;
1120 }
1121 
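/* Every API topology maps to the class of primitives that comes out of the
 * geometry stage: points, line strips, triangle strips, or rect lists. */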
1122 ALWAYS_INLINE
1123 static unsigned si_conv_prim_to_gs_out(unsigned mode)
1124 {
1125    static const int prim_conv[] = {
1126       [PIPE_PRIM_POINTS] = V_028A6C_POINTLIST,
1127       [PIPE_PRIM_LINES] = V_028A6C_LINESTRIP,
1128       [PIPE_PRIM_LINE_LOOP] = V_028A6C_LINESTRIP,
1129       [PIPE_PRIM_LINE_STRIP] = V_028A6C_LINESTRIP,
1130       [PIPE_PRIM_TRIANGLES] = V_028A6C_TRISTRIP,
1131       [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_TRISTRIP,
1132       [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_TRISTRIP,
1133       [PIPE_PRIM_QUADS] = V_028A6C_TRISTRIP,
1134       [PIPE_PRIM_QUAD_STRIP] = V_028A6C_TRISTRIP,
1135       [PIPE_PRIM_POLYGON] = V_028A6C_TRISTRIP,
1136       [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_LINESTRIP,
1137       [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_LINESTRIP,
1138       [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_TRISTRIP,
1139       [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_TRISTRIP,
1140       [PIPE_PRIM_PATCHES] = V_028A6C_POINTLIST,
1141       [SI_PRIM_RECTANGLE_LIST] = V_028A6C_RECTLIST,
1142    };
1143    assert(mode < ARRAY_SIZE(prim_conv));
1144 
1145    return prim_conv[mode];
1146 }
1147 
1148 /* rast_prim is the primitive type after GS. */
1149 template<amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
1150 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
1151 {
1152    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1153    enum pipe_prim_type rast_prim = sctx->current_rast_prim;
1154    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
1155 
1156    radeon_begin(cs);
1157 
1158    if (unlikely(si_is_line_stipple_enabled(sctx))) {
1159       /* For lines, reset the stipple pattern at each primitive. Otherwise,
1160        * reset the stipple pattern at each packet (line strips, line loops).
1161        */
1162       bool reset_per_prim = rast_prim == PIPE_PRIM_LINES ||
1163                             rast_prim == PIPE_PRIM_LINES_ADJACENCY;
1164       /* 0 = no reset, 1 = reset per prim, 2 = reset per packet */
1165       unsigned value =
1166          rs->pa_sc_line_stipple | S_028A0C_AUTO_RESET_CNTL(reset_per_prim ? 1 : 2);
1167 
1168       radeon_opt_set_context_reg(sctx, R_028A0C_PA_SC_LINE_STIPPLE, SI_TRACKED_PA_SC_LINE_STIPPLE,
1169                                  value);
1170    }
1171 
1172    unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
1173    if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (NGG || HAS_GS))) {
1174       if (GFX_VERSION >= GFX11)
1175          radeon_set_uconfig_reg(R_030998_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
1176       else
1177          radeon_set_context_reg(R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
1178       sctx->last_gs_out_prim = gs_out_prim;
1179    }
1180 
1181    if (GFX_VERSION == GFX9)
1182       radeon_end_update_context_roll(sctx);
1183    else
1184       radeon_end();
1185 
1186    if (NGG) {
1187       struct si_shader *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current;
1188 
1189       if (hw_vs->uses_vs_state_provoking_vertex) {
1190          unsigned vtx_index = rs->flatshade_first ? 0 : gs_out_prim;
1191 
1192          SET_FIELD(sctx->current_gs_state, GS_STATE_PROVOKING_VTX_INDEX, vtx_index);
1193       }
1194 
1195       if (hw_vs->uses_gs_state_outprim) {
1196          SET_FIELD(sctx->current_gs_state, GS_STATE_OUTPRIM, gs_out_prim);
1197       }
1198    }
1199 }
1200 
1201 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
1202           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
1203 static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
1204 {
1205    if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) {
1206       /* Re-emit the state after we leave u_blitter. */
1207       sctx->last_vs_state = ~0;
1208       sctx->last_gs_state = ~0;
1209       return;
1210    }
1211 
1212    unsigned vs_state = sctx->current_vs_state; /* all VS bits including LS bits */
1213    unsigned gs_state = sctx->current_gs_state; /* only GS and NGG bits; VS bits will be copied here */
1214 
1215    if (sctx->shader.vs.cso->info.uses_base_vertex && index_size)
1216       vs_state |= ENCODE_FIELD(VS_STATE_INDEXED, 1);
1217 
1218    /* Copy all state bits from vs_state to gs_state except the LS bits. */
1219    gs_state |= vs_state &
1220                CLEAR_FIELD(VS_STATE_LS_OUT_PATCH_SIZE) &
1221                CLEAR_FIELD(VS_STATE_LS_OUT_VERTEX_SIZE);
1222 
1223    if (vs_state != sctx->last_vs_state ||
1224        ((HAS_GS || NGG) && gs_state != sctx->last_gs_state)) {
1225       struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1226 
1227       /* These are all constant expressions. */
1228       unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
1229                                                PIPE_SHADER_VERTEX);
1230       unsigned tes_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
1231                                                 PIPE_SHADER_TESS_EVAL);
1232       unsigned gs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
1233                                                PIPE_SHADER_GEOMETRY);
1234       unsigned gs_copy_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
1235 
1236       radeon_begin(cs);
1237       if (HAS_GS) {
1238          radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
1239 
1240          /* NGG always uses the state bits. Legacy GS uses the state bits only for the emulation
1241           * of GS pipeline statistics on gfx10.x.
1242           */
1243          if (NGG || (GFX_VERSION >= GFX10 && GFX_VERSION <= GFX10_3))
1244             radeon_set_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
1245 
1246          /* The GS copy shader (for legacy GS) always uses the state bits. */
1247          if (!NGG)
1248             radeon_set_sh_reg(gs_copy_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
1249       } else if (HAS_TESS) {
1250          radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
1251          radeon_set_sh_reg(tes_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
1252       } else {
1253          radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
1254       }
1255       radeon_end();
1256 
1257       sctx->last_vs_state = vs_state;
1258       if (HAS_GS || NGG)
1259          sctx->last_gs_state = gs_state;
1260    }
1261 }
1262 
1263 ALWAYS_INLINE
1264 static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitive_restart,
1265                                           unsigned restart_index)
1266 {
1267    return primitive_restart && (restart_index != sctx->last_restart_index ||
1268                                 sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
1269 }
1270 
1271 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
1272           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
1273 static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
1274                                        const struct pipe_draw_indirect_info *indirect,
1275                                        enum pipe_prim_type prim, unsigned num_patches,
1276                                        unsigned instance_count, bool primitive_restart,
1277                                        unsigned min_vertex_count)
1278 {
1279    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1280    unsigned ia_multi_vgt_param;
1281 
1282    ia_multi_vgt_param =
1283       si_get_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS, IS_DRAW_VERTEX_STATE>
1284          (sctx, indirect, prim, num_patches, instance_count, primitive_restart,
1285           min_vertex_count);
1286 
1287    /* Draw state. */
1288    if (ia_multi_vgt_param != sctx->last_multi_vgt_param ||
1289        /* Workaround for SpecviewPerf13 Catia hang on GFX9. */
1290        (GFX_VERSION == GFX9 && prim != sctx->last_prim)) {
1291       radeon_begin(cs);
1292 
1293       if (GFX_VERSION == GFX9)
1294          radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION,
1295                                     R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
1296       else if (GFX_VERSION >= GFX7)
1297          radeon_set_context_reg_idx(R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
1298       else
1299          radeon_set_context_reg(R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
1300 
1301       radeon_end();
1302 
1303       sctx->last_multi_vgt_param = ia_multi_vgt_param;
1304    }
1305 }
1306 
1307 /* GFX10 removed IA_MULTI_VGT_PARAM in favor of GE_CNTL.
1308  * We overload last_multi_vgt_param.
1309  */
1310 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
1311 static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
1312 {
1313    union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
1314    unsigned ge_cntl;
1315 
1316    if (NGG) {
1317       if (HAS_TESS) {
1318          if (GFX_VERSION >= GFX11) {
1319             unsigned prim_grp_size =
1320                G_03096C_PRIM_GRP_SIZE_GFX11(si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ge_cntl);
1321 
1322             ge_cntl = S_03096C_PRIMS_PER_SUBGRP(num_patches) |
1323                       S_03096C_VERTS_PER_SUBGRP(si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ngg.hw_max_esverts) |
1324                       S_03096C_BREAK_PRIMGRP_AT_EOI(key.u.tess_uses_prim_id) |
1325                       S_03096C_PRIM_GRP_SIZE_GFX11(prim_grp_size);
1326          } else {
1327             ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(num_patches) |
1328                       S_03096C_VERT_GRP_SIZE(0) |
1329                       S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
1330          }
1331       } else {
1332          ge_cntl = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ge_cntl;
1333       }
1334    } else {
1335       unsigned primgroup_size;
1336       unsigned vertgroup_size;
1337       assert(GFX_VERSION < GFX11);
1338 
1339       if (HAS_TESS) {
1340          primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
1341          vertgroup_size = 0;
1342       } else if (HAS_GS) {
1343          unsigned vgt_gs_onchip_cntl = sctx->shader.gs.current->ctx_reg.gs.vgt_gs_onchip_cntl;
1344          primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
1345          vertgroup_size = G_028A44_ES_VERTS_PER_SUBGRP(vgt_gs_onchip_cntl);
1346       } else {
1347          primgroup_size = 128; /* recommended without a GS and tess */
1348          vertgroup_size = 0;
1349       }
1350 
1351       ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) |
1352                 S_03096C_VERT_GRP_SIZE(vertgroup_size) |
1353                 S_03096C_BREAK_WAVE_AT_EOI(key.u.uses_tess && key.u.tess_uses_prim_id);
1354    }
1355 
1356    ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));
1357 
1358    if (ge_cntl != sctx->last_multi_vgt_param) {
1359       struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1360 
1361       radeon_begin(cs);
1362       radeon_set_uconfig_reg(R_03096C_GE_CNTL, ge_cntl);
1363       radeon_end();
1364       sctx->last_multi_vgt_param = ge_cntl;
1365    }
1366 }
1367 
1368 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
1369           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
1370 static void si_emit_draw_registers(struct si_context *sctx,
1371                                    const struct pipe_draw_indirect_info *indirect,
1372                                    enum pipe_prim_type prim, unsigned num_patches,
1373                                    unsigned instance_count, bool primitive_restart,
1374                                    unsigned restart_index, unsigned min_vertex_count)
1375 {
1376    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1377 
1378    if (IS_DRAW_VERTEX_STATE)
1379       primitive_restart = false;
1380 
1381    if (GFX_VERSION >= GFX10)
1382       gfx10_emit_ge_cntl<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, num_patches);
1383    else
1384       si_emit_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS, IS_DRAW_VERTEX_STATE>
1385          (sctx, indirect, prim, num_patches, instance_count, primitive_restart,
1386           min_vertex_count);
1387 
1388    radeon_begin(cs);
1389 
1390    if (prim != sctx->last_prim) {
1391       unsigned vgt_prim = si_conv_pipe_prim(prim);
1392 
1393       if (GFX_VERSION >= GFX10)
1394          radeon_set_uconfig_reg(R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
1395       else if (GFX_VERSION >= GFX7)
1396          radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
1397       else
1398          radeon_set_config_reg(R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
1399 
1400       sctx->last_prim = prim;
1401    }
1402 
1403    /* Primitive restart. */
1404    if (primitive_restart != sctx->last_primitive_restart_en) {
1405       if (GFX_VERSION >= GFX11)
1406          radeon_set_uconfig_reg(R_03092C_GE_MULTI_PRIM_IB_RESET_EN, primitive_restart);
1407       else if (GFX_VERSION >= GFX9)
1408          radeon_set_uconfig_reg(R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
1409       else
1410          radeon_set_context_reg(R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
1411       sctx->last_primitive_restart_en = primitive_restart;
1412    }
1413    if (si_prim_restart_index_changed(sctx, primitive_restart, restart_index)) {
1414       radeon_set_context_reg(R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, restart_index);
1415       sctx->last_restart_index = restart_index;
1416       if (GFX_VERSION == GFX9)
1417          sctx->context_roll = true;
1418    }
1419    radeon_end();
1420 }
1421 
1422 #define EMIT_SQTT_END_DRAW do {                                          \
1423       if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \
1424          radeon_begin(&sctx->gfx_cs);                                    \
1425          radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));       \
1426          radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) |          \
1427                      EVENT_INDEX(0));                                    \
1428          radeon_end();                                      \
1429       }                                                                  \
1430    } while (0)
1431 
1432 template <amd_gfx_level GFX_VERSION, si_has_ngg NGG, si_is_draw_vertex_state IS_DRAW_VERTEX_STATE>
1433 ALWAYS_INLINE
1434 static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
1435                                  unsigned drawid_base,
1436                                  const struct pipe_draw_indirect_info *indirect,
1437                                  const struct pipe_draw_start_count_bias *draws,
1438                                  unsigned num_draws,
1439                                  struct pipe_resource *indexbuf, unsigned index_size,
1440                                  unsigned index_offset, unsigned instance_count)
1441 {
1442    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
1443 
1444    if (unlikely(sctx->thread_trace_enabled)) {
1445       si_sqtt_write_event_marker(sctx, &sctx->gfx_cs, sctx->sqtt_next_event,
1446                                  UINT_MAX, UINT_MAX, UINT_MAX);
1447    }
1448 
1449    uint32_t use_opaque = 0;
1450 
1451    if (!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) {
1452       struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output;
1453 
1454       radeon_begin(cs);
1455       radeon_set_context_reg(R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw);
1456       radeon_end();
1457 
1458       si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL,
1459                       R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM,
1460                       t->buf_filled_size, t->buf_filled_size_offset);
1461       use_opaque = S_0287F0_USE_OPAQUE(1);
1462       indirect = NULL;
1463    }
1464 
1465    uint32_t index_max_size = 0;
1466    uint64_t index_va = 0;
1467    bool disable_instance_packing = false;
1468 
1469    radeon_begin(cs);
1470 
1471    if (GFX_VERSION == GFX10_3) {
1472       /* Workaround for incorrect stats with adjacent primitive types
1473        * (see PAL's waDisableInstancePacking).
1474        */
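      /* The check below applies only when pipeline statistics queries are active, there is
       * no GS, instancing or an indirect draw is used, and the primitive type is one of the
       * four *_ADJACENCY types.
       */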
1475       if (sctx->num_pipeline_stat_queries &&
1476           sctx->shader.gs.cso == NULL &&
1477           (instance_count > 1 || indirect) &&
1478           (1 << info->mode) & (1 << PIPE_PRIM_LINES_ADJACENCY |
1479                                1 << PIPE_PRIM_LINE_STRIP_ADJACENCY |
1480                                1 << PIPE_PRIM_TRIANGLES_ADJACENCY |
1481                                1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)) {
1482          disable_instance_packing = true;
1483       }
1484    }
1485 
1486    /* draw packet */
1487    if (index_size) {
1488       /* Register shadowing doesn't shadow INDEX_TYPE. */
1489       if (index_size != sctx->last_index_size || sctx->shadowed_regs ||
1490           (GFX_VERSION == GFX10_3 && disable_instance_packing != sctx->disable_instance_packing)) {
1491          unsigned index_type;
1492 
1493          /* Index type computation. When we look at how we need to translate index_size,
1494           * we can see that we just need 2 shifts to get the hw value.
1495           *
1496           * 1 = 001b --> 10b = 2
1497           * 2 = 010b --> 00b = 0
1498           * 4 = 100b --> 01b = 1
1499           */
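         /* Sanity check of the mapping, e.g. index_size == 4: (4 >> 2) | (4 << 1) = 0b1001,
          * and 0b1001 & 0x3 = 1, which is the hw value for 32-bit indices.
          */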
1500          index_type = (((index_size >> 2) | (index_size << 1)) & 0x3) |
1501                       S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
1502 
1503          if (GFX_VERSION <= GFX7 && SI_BIG_ENDIAN) {
1504             /* GFX7 and older don't support ubyte indices, so only 16- and 32-bit swaps are handled here. */
1505             index_type |= index_size == 2 ? V_028A7C_VGT_DMA_SWAP_16_BIT
1506                                           : V_028A7C_VGT_DMA_SWAP_32_BIT;
1507          }
1508 
1509          if (GFX_VERSION >= GFX9) {
1510             radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION,
1511                                        R_03090C_VGT_INDEX_TYPE, 2, index_type);
1512          } else {
1513             radeon_emit(PKT3(PKT3_INDEX_TYPE, 0, 0));
1514             radeon_emit(index_type);
1515          }
1516 
1517          sctx->last_index_size = index_size;
1518          if (GFX_VERSION == GFX10_3)
1519             sctx->disable_instance_packing = disable_instance_packing;
1520       }
1521 
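      /* E.g. a 1024-byte index buffer with offset 0 and 16-bit indices can hold at most
       * (1024 - 0) >> 1 = 512 indices.
       */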
1522       index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size);
1523       /* Skip draw calls with 0-sized index buffers.
1524        * They cause a hang on some chips, like Navi10-14.
1525        */
1526       if (!index_max_size) {
1527          radeon_end();
1528          return;
1529       }
1530 
1531       index_va = si_resource(indexbuf)->gpu_address + index_offset;
1532 
1533       radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf),
1534                                 RADEON_USAGE_READ | RADEON_PRIO_INDEX_BUFFER);
1535    } else {
1536       /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
1537        * so the state must be re-emitted before the next indexed draw.
1538        */
1539       if (GFX_VERSION >= GFX7)
1540          sctx->last_index_size = -1;
1541       if (GFX_VERSION == GFX10_3 && disable_instance_packing != sctx->disable_instance_packing) {
1542          radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION,
1543                                     R_03090C_VGT_INDEX_TYPE, 2,
1544                                     S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing));
1545          sctx->disable_instance_packing = disable_instance_packing;
1546       }
1547    }
1548 
1549    unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
1550    bool render_cond_bit = sctx->render_cond_enabled;
1551 
1552    if (!IS_DRAW_VERTEX_STATE && indirect) {
1553       assert(num_draws == 1);
1554       uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
1555 
1556       assert(indirect_va % 8 == 0);
1557 
1558       si_invalidate_draw_constants(sctx);
1559 
1560       radeon_emit(PKT3(PKT3_SET_BASE, 2, 0));
1561       radeon_emit(1);
1562       radeon_emit(indirect_va);
1563       radeon_emit(indirect_va >> 32);
1564 
1565       radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indirect->buffer),
1566                                 RADEON_USAGE_READ | RADEON_PRIO_DRAW_INDIRECT);
1567 
1568       unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
1569 
1570       assert(indirect->offset % 4 == 0);
1571 
1572       if (index_size) {
1573          radeon_emit(PKT3(PKT3_INDEX_BASE, 1, 0));
1574          radeon_emit(index_va);
1575          radeon_emit(index_va >> 32);
1576 
1577          radeon_emit(PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
1578          radeon_emit(index_max_size);
1579       }
1580 
1581       if (!sctx->screen->has_draw_indirect_multi) {
1582          radeon_emit(PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3,
1583                           render_cond_bit));
1584          radeon_emit(indirect->offset);
1585          radeon_emit((sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
1586          radeon_emit((sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
1587          radeon_emit(di_src_sel);
1588       } else {
1589          uint64_t count_va = 0;
1590 
1591          if (indirect->indirect_draw_count) {
1592             struct si_resource *params_buf = si_resource(indirect->indirect_draw_count);
1593 
1594             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, params_buf,
1595                                       RADEON_USAGE_READ | RADEON_PRIO_DRAW_INDIRECT);
1596 
1597             count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset;
1598          }
1599 
1600          radeon_emit(PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
1601                           render_cond_bit));
1602          radeon_emit(indirect->offset);
1603          radeon_emit((sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
1604          radeon_emit((sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
1605          radeon_emit(((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) |
1606                      S_2C3_DRAW_INDEX_ENABLE(sctx->shader.vs.cso->info.uses_drawid) |
1607                      S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count));
1608          radeon_emit(indirect->draw_count);
1609          radeon_emit(count_va);
1610          radeon_emit(count_va >> 32);
1611          radeon_emit(indirect->stride);
1612          radeon_emit(di_src_sel);
1613       }
1614    } else {
1615       /* Register shadowing requires that we always emit PKT3_NUM_INSTANCES. */
1616       if (sctx->shadowed_regs ||
1617           sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
1618           sctx->last_instance_count != instance_count) {
1619          radeon_emit(PKT3(PKT3_NUM_INSTANCES, 0, 0));
1620          radeon_emit(instance_count);
1621          sctx->last_instance_count = instance_count;
1622       }
1623 
1624       /* Base vertex and start instance. */
1625       int base_vertex = index_size ? draws[0].index_bias : draws[0].start;
1626 
1627       bool set_draw_id = !IS_DRAW_VERTEX_STATE && sctx->vs_uses_draw_id;
1628       bool set_base_instance = sctx->vs_uses_base_instance;
1629 
1630       if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) {
1631          /* Re-emit draw constants after we leave u_blitter. */
1632          si_invalidate_draw_sh_constants(sctx);
1633 
1634          /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
1635          radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);
1636          radeon_emit_array(sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs);
1637       } else if (base_vertex != sctx->last_base_vertex ||
1638                  sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
1639                  (set_base_instance &&
1640                   (info->start_instance != sctx->last_start_instance ||
1641                    sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) ||
1642                  (set_draw_id &&
1643                   (drawid_base != sctx->last_drawid ||
1644                    sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) ||
1645                  sh_base_reg != sctx->last_sh_base_reg) {
1646          if (set_base_instance) {
1647             radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
1648             radeon_emit(base_vertex);
1649             radeon_emit(drawid_base);
1650             radeon_emit(info->start_instance);
1651 
1652             sctx->last_start_instance = info->start_instance;
1653             sctx->last_drawid = drawid_base;
1654          } else if (set_draw_id) {
1655             radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
1656             radeon_emit(base_vertex);
1657             radeon_emit(drawid_base);
1658 
1659             sctx->last_drawid = drawid_base;
1660          } else {
1661             radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
1662          }
1663 
1664          sctx->last_base_vertex = base_vertex;
1665          sctx->last_sh_base_reg = sh_base_reg;
1666       }
1667 
1668       /* Don't update draw_id in the following code if it doesn't increment. */
1669       bool increment_draw_id = !IS_DRAW_VERTEX_STATE && num_draws > 1 &&
1670                                set_draw_id && info->increment_draw_id;
1671 
1672       if (index_size) {
1673          /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
1674           * can be changed between draws, and GS fast launch must be disabled.
1675           * NOT_EOP doesn't work on gfx9 and older.
1676           *
1677           * Instead of doing this, which evaluates the case conditions repeatedly:
1678           *  for (all draws) {
1679           *    if (case1);
1680           *    else;
1681           *  }
1682           *
1683           * Use this structuring to evaluate the case conditions once:
1684           *  if (case1) for (all draws);
1685           *  else for (all draws);
1686           *
1687           */
1688          bool index_bias_varies = !IS_DRAW_VERTEX_STATE && num_draws > 1 &&
1689                                   info->index_bias_varies;
1690 
1691          if (increment_draw_id) {
1692             if (index_bias_varies) {
1693                for (unsigned i = 0; i < num_draws; i++) {
1694                   uint64_t va = index_va + draws[i].start * index_size;
1695 
1696                   if (i > 0) {
1697                      radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
1698                      radeon_emit(draws[i].index_bias);
1699                      radeon_emit(drawid_base + i);
1700                   }
1701 
1702                   radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
1703                   radeon_emit(index_max_size);
1704                   radeon_emit(va);
1705                   radeon_emit(va >> 32);
1706                   radeon_emit(draws[i].count);
1707                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
1708                }
1709                if (num_draws > 1) {
1710                   sctx->last_base_vertex = draws[num_draws - 1].index_bias;
1711                   sctx->last_drawid = drawid_base + num_draws - 1;
1712                }
1713             } else {
1714                /* Only DrawID varies. */
1715                for (unsigned i = 0; i < num_draws; i++) {
1716                   uint64_t va = index_va + draws[i].start * index_size;
1717 
1718                   if (i > 0)
1719                      radeon_set_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base + i);
1720 
1721                   radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
1722                   radeon_emit(index_max_size);
1723                   radeon_emit(va);
1724                   radeon_emit(va >> 32);
1725                   radeon_emit(draws[i].count);
1726                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
1727                }
1728                if (num_draws > 1)
1729                   sctx->last_drawid = drawid_base + num_draws - 1;
1730             }
1731          } else {
1732             if (index_bias_varies) {
1733                /* Only BaseVertex varies. */
1734                for (unsigned i = 0; i < num_draws; i++) {
1735                   uint64_t va = index_va + draws[i].start * index_size;
1736 
1737                   if (i > 0)
1738                      radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].index_bias);
1739 
1740                   radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
1741                   radeon_emit(index_max_size);
1742                   radeon_emit(va);
1743                   radeon_emit(va >> 32);
1744                   radeon_emit(draws[i].count);
1745                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */
1746                }
1747                if (num_draws > 1)
1748                   sctx->last_base_vertex = draws[num_draws - 1].index_bias;
1749             } else {
1750                /* DrawID and BaseVertex are constant. */
1751                if (GFX_VERSION == GFX10) {
1752                   /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have
1753                    * count == 0 in the last draw (which doesn't set NOT_EOP).
1754                    *
1755                    * So remove all trailing draws with count == 0.
1756                    */
1757                   while (num_draws > 1 && !draws[num_draws - 1].count)
1758                      num_draws--;
1759                }
1760 
1761                for (unsigned i = 0; i < num_draws; i++) {
1762                   uint64_t va = index_va + draws[i].start * index_size;
1763 
1764                   radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
1765                   radeon_emit(index_max_size);
1766                   radeon_emit(va);
1767                   radeon_emit(va >> 32);
1768                   radeon_emit(draws[i].count);
1769                   radeon_emit(V_0287F0_DI_SRC_SEL_DMA |
1770                               S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && i < num_draws - 1));
1771                }
1772             }
1773          }
1774       } else {
1775          for (unsigned i = 0; i < num_draws; i++) {
1776             if (i > 0) {
1777                if (increment_draw_id) {
1778                   unsigned draw_id = drawid_base + i;
1779 
1780                   radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
1781                   radeon_emit(draws[i].start);
1782                   radeon_emit(draw_id);
1783 
1784                   sctx->last_drawid = draw_id;
1785                } else {
1786                   radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].start);
1787                }
1788             }
1789 
1790             radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
1791             radeon_emit(draws[i].count);
1792             radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
1793          }
1794          if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs))
1795             sctx->last_base_vertex = draws[num_draws - 1].start;
1796       }
1797    }
1798    radeon_end();
1799 
1800    EMIT_SQTT_END_DRAW;
1801 }
1802 
1803 /* Return false if not bound. */
1804 template<amd_gfx_level GFX_VERSION>
1805 static bool ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems,
1806                                                struct pipe_vertex_buffer *vb,
1807                                                unsigned index, /* vertex element index */
1808                                                uint32_t *desc) /* where to upload descriptors */
1809 {
1810    struct si_resource *buf = si_resource(vb->buffer.resource);
1811    if (!buf) {
1812       memset(desc, 0, 16);
1813       return false;
1814    }
1815 
1816    int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[index];
1817 
1818    if (offset >= buf->b.b.width0) {
1819       assert(offset < buf->b.b.width0);
1820       memset(desc, 0, 16);
1821       return false;
1822    }
1823 
1824    uint64_t va = buf->gpu_address + offset;
1825 
1826    int64_t num_records = (int64_t)buf->b.b.width0 - offset;
1827    if (GFX_VERSION != GFX8 && vb->stride) {
1828       /* Round up by rounding down and adding 1 */
1829       num_records = (num_records - velems->format_size[index]) / vb->stride + 1;
1830    }
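   /* Example for the structured case: width0 = 100, offset = 0, format_size = 12, stride = 16
    * gives (100 - 12) / 16 + 1 = 6 records; the 6th vertex at byte offset 80 still fits
    * (80 + 12 <= 100), while a 7th at byte offset 96 would overrun the buffer.
    */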
1831    assert(num_records >= 0 && num_records <= UINT_MAX);
1832 
1833    uint32_t rsrc_word3 = velems->rsrc_word3[index];
1834 
1835    /* OOB_SELECT chooses the out-of-bounds check:
1836     *  - 1: index >= NUM_RECORDS (Structured)
1837     *  - 3: offset >= NUM_RECORDS (Raw)
1838     */
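   /* A non-zero stride selects the structured (per-index) check, matching the element-count
    * NUM_RECORDS computed above; stride 0 selects the raw (byte-offset) check, where
    * NUM_RECORDS is the size in bytes.
    */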
1839    if (GFX_VERSION >= GFX10)
1840       rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
1841                                                    : V_008F0C_OOB_SELECT_RAW);
1842 
1843    desc[0] = va;
1844    desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride);
1845    desc[2] = num_records;
1846    desc[3] = rsrc_word3;
1847    return true;
1848 }
1849 
1850 #if GFX_VER == 6 /* declare this function only once because it supports all chips. */
1851 
1852 void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems,
1853                                      struct pipe_vertex_buffer *vb, unsigned element_index,
1854                                      uint32_t *out)
1855 {
1856    switch (sscreen->info.gfx_level) {
1857    case GFX6:
1858       si_set_vb_descriptor<GFX6>(velems, vb, element_index, out);
1859       break;
1860    case GFX7:
1861       si_set_vb_descriptor<GFX7>(velems, vb, element_index, out);
1862       break;
1863    case GFX8:
1864       si_set_vb_descriptor<GFX8>(velems, vb, element_index, out);
1865       break;
1866    case GFX9:
1867       si_set_vb_descriptor<GFX9>(velems, vb, element_index, out);
1868       break;
1869    case GFX10:
1870       si_set_vb_descriptor<GFX10>(velems, vb, element_index, out);
1871       break;
1872    case GFX10_3:
1873       si_set_vb_descriptor<GFX10_3>(velems, vb, element_index, out);
1874       break;
1875    case GFX11:
1876       si_set_vb_descriptor<GFX11>(velems, vb, element_index, out);
1877       break;
1878    default:
1879       unreachable("unhandled gfx level");
1880    }
1881 }
1882 
1883 #endif
1884 
1885 template<util_popcnt POPCNT>
1886 static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state,
1887                                                          uint32_t *partial_velem_mask)
1888 {
1889    unsigned semantic_index = u_bit_scan(partial_velem_mask);
1890    assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index));
1891    /* A prefix mask of the full mask gives us the index in pipe_vertex_state. */
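   /* E.g. full_velem_mask = 0b1011 and semantic_index = 3: popcount(0b1011 & 0b0111) = 2,
    * so element 3 is stored at index 2 of the packed element list.
    */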
1892    return util_bitcount_fast<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
1893 }
1894 
1895 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
1896           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
1897 static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
1898                                                   struct pipe_vertex_state *state,
1899                                                   uint32_t partial_velem_mask)
1900 {
1901    struct si_vertex_state *vstate = (struct si_vertex_state *)state;
1902    unsigned count = IS_DRAW_VERTEX_STATE ? util_bitcount_fast<POPCNT>(partial_velem_mask) :
1903                                            sctx->num_vertex_elements;
1904    unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
1905                                             PIPE_SHADER_VERTEX);
1906    unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION);
1907    bool pointer_dirty, user_sgprs_dirty;
1908 
1909    assert(count <= SI_MAX_ATTRIBS);
1910 
1911    if (sctx->vertex_buffers_dirty || IS_DRAW_VERTEX_STATE) {
1912       assert(count);
1913 
1914       struct si_vertex_elements *velems = sctx->vertex_elements;
1915       unsigned alloc_size = IS_DRAW_VERTEX_STATE ?
1916                                vstate->velems.vb_desc_list_alloc_size :
1917                                velems->vb_desc_list_alloc_size;
1918       uint32_t *ptr;
1919 
1920       if (alloc_size) {
1921          /* Vertex buffer descriptors are the only ones which are uploaded directly
1922           * and don't go through si_upload_graphics_shader_descriptors.
1923           */
1924          u_upload_alloc(sctx->b.const_uploader, 0, alloc_size,
1925                         si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset,
1926                         (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr);
1927          if (!sctx->vb_descriptors_buffer) {
1928             sctx->vb_descriptors_offset = 0;
1929             sctx->vb_descriptors_gpu_list = NULL;
1930             return false;
1931          }
1932 
1933          sctx->vb_descriptors_gpu_list = ptr;
1934          radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
1935                                    RADEON_USAGE_READ | RADEON_PRIO_DESCRIPTORS);
1936          /* GFX6 doesn't support the L2 prefetch. */
1937          if (GFX_VERSION >= GFX7)
1938             si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
1939                                alloc_size);
1940       } else {
1941          si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
1942       }
1943 
1944       if (IS_DRAW_VERTEX_STATE) {
1945          unsigned i = 0;
1946 
1947          if (num_vbos_in_user_sgprs) {
1948             unsigned num_vb_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;
1949 
1950             radeon_begin(&sctx->gfx_cs);
1951             radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_vb_sgprs);
1952 
1953             for (; partial_velem_mask && i < num_vbos_in_user_sgprs; i++) {
1954                unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
1955 
1956                radeon_emit_array(&vstate->descriptors[velem_index * 4], 4);
1957             }
1958             radeon_end();
1959          }
1960 
1961          for (; partial_velem_mask; i++) {
1962             unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
1963             uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
1964 
1965             memcpy(desc, &vstate->descriptors[velem_index * 4], 16);
1966          }
1967 
1968          if (vstate->b.input.vbuffer.buffer.resource != vstate->b.input.indexbuf) {
1969             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs,
1970                                       si_resource(vstate->b.input.vbuffer.buffer.resource),
1971                                       RADEON_USAGE_READ | RADEON_PRIO_VERTEX_BUFFER);
1972          }
1973 
1974          /* The next draw_vbo should recompute and rebind vertex buffer descriptors. */
1975          sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
1976 
1977          user_sgprs_dirty = false; /* We just set them above. */
1978          pointer_dirty = count > num_vbos_in_user_sgprs;
1979       } else {
1980          unsigned first_vb_use_mask = velems->first_vb_use_mask;
1981 
1982          for (unsigned i = 0; i < count; i++) {
1983             unsigned vbo_index = velems->vertex_buffer_index[i];
1984             struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
1985             uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
1986                                                         : &ptr[(i - num_vbos_in_user_sgprs) * 4];
1987 
1988             if (!si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc))
1989                continue;
1990 
1991             if (first_vb_use_mask & (1 << i)) {
1992                radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource),
1993                                          RADEON_USAGE_READ | RADEON_PRIO_VERTEX_BUFFER);
1994             }
1995          }
1996 
1997          sctx->vertex_buffers_dirty = false;
1998          user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
1999          pointer_dirty = alloc_size != 0;
2000       }
2001    } else {
2002       pointer_dirty = sctx->vertex_buffer_pointer_dirty;
2003       user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty;
2004    }
2005 
2006    if (pointer_dirty || user_sgprs_dirty) {
2007       struct radeon_cmdbuf *cs = &sctx->gfx_cs;
2008       assert(count);
2009 
2010       radeon_begin(cs);
2011 
2012       /* Set the pointer to vertex buffer descriptors. */
2013       if (pointer_dirty && count > num_vbos_in_user_sgprs) {
2014          /* Find the location of the VB descriptor pointer. */
2015          unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
2016          if (GFX_VERSION >= GFX9) {
2017             if (HAS_TESS)
2018                sh_dw_offset = GFX9_TCS_NUM_USER_SGPR;
2019             else if (HAS_GS || NGG)
2020                sh_dw_offset = GFX9_GS_NUM_USER_SGPR;
2021          }
2022 
2023          radeon_set_sh_reg(sh_base + sh_dw_offset * 4,
2024                            sctx->vb_descriptors_buffer->gpu_address +
2025                            sctx->vb_descriptors_offset);
2026          sctx->vertex_buffer_pointer_dirty = false;
2027       }
2028 
2029       /* Set VB descriptors in user SGPRs. */
2030       if (user_sgprs_dirty) {
2031          assert(num_vbos_in_user_sgprs);
2032 
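         /* Each vertex buffer descriptor is 4 dwords, i.e. 4 user SGPRs per buffer. */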
2033          unsigned num_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4;
2034 
2035          radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_sgprs);
2036          radeon_emit_array(sctx->vb_descriptor_user_sgprs, num_sgprs);
2037          sctx->vertex_buffer_user_sgprs_dirty = false;
2038       }
2039       radeon_end();
2040    }
2041 
2042    return true;
2043 }
2044 
2045 static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info,
2046                                     const struct pipe_draw_indirect_info *indirect,
2047                                     const struct pipe_draw_start_count_bias *draws,
2048                                     unsigned num_draws, unsigned *start, unsigned *count)
2049 {
2050    if (indirect && !indirect->count_from_stream_output) {
2051       unsigned indirect_count;
2052       struct pipe_transfer *transfer;
2053       unsigned begin, end;
2054       unsigned map_size;
2055       unsigned *data;
2056 
2057       if (indirect->indirect_draw_count) {
2058          data = (unsigned*)
2059                 pipe_buffer_map_range(&sctx->b, indirect->indirect_draw_count,
2060                                       indirect->indirect_draw_count_offset, sizeof(unsigned),
2061                                       PIPE_MAP_READ, &transfer);
2062 
2063          indirect_count = *data;
2064 
2065          pipe_buffer_unmap(&sctx->b, transfer);
2066       } else {
2067          indirect_count = indirect->draw_count;
2068       }
2069 
2070       if (!indirect_count) {
2071          *start = *count = 0;
2072          return;
2073       }
2074 
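      /* Only the first 3 dwords of the last record need to be mapped, since the loop below
       * reads just data[0] (count) and data[2] (start) from each record.
       */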
2075       map_size = (indirect_count - 1) * indirect->stride + 3 * sizeof(unsigned);
2076       data = (unsigned*)
2077              pipe_buffer_map_range(&sctx->b, indirect->buffer, indirect->offset, map_size,
2078                                    PIPE_MAP_READ, &transfer);
2079 
2080       begin = UINT_MAX;
2081       end = 0;
2082 
2083       for (unsigned i = 0; i < indirect_count; ++i) {
2084          unsigned count = data[0];
2085          unsigned start = data[2];
2086 
2087          if (count > 0) {
2088             begin = MIN2(begin, start);
2089             end = MAX2(end, start + count);
2090          }
2091 
2092          data += indirect->stride / sizeof(unsigned);
2093       }
2094 
2095       pipe_buffer_unmap(&sctx->b, transfer);
2096 
2097       if (begin < end) {
2098          *start = begin;
2099          *count = end - begin;
2100       } else {
2101          *start = *count = 0;
2102       }
2103    } else {
2104       unsigned min_element = UINT_MAX;
2105       unsigned max_element = 0;
2106 
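      /* Compute the union of all draw ranges, e.g. draws {start=10, count=5} and
       * {start=2, count=3} yield *start = 2 and *count = 15 - 2 = 13.
       */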
2107       for (unsigned i = 0; i < num_draws; i++) {
2108          min_element = MIN2(min_element, draws[i].start);
2109          max_element = MAX2(max_element, draws[i].start + draws[i].count);
2110       }
2111 
2112       *start = min_element;
2113       *count = max_element - min_element;
2114    }
2115 }
2116 
2117 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
2118           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
2119 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
2120                                const struct pipe_draw_indirect_info *indirect,
2121                                enum pipe_prim_type prim, unsigned instance_count,
2122                                unsigned min_vertex_count, bool primitive_restart,
2123                                unsigned skip_atom_mask)
2124 {
2125    unsigned num_patches = 0;
2126 
2127    si_emit_rasterizer_prim_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx);
2128    if (HAS_TESS)
2129       si_emit_derived_tess_state(sctx, &num_patches);
2130 
2131    /* Emit state atoms. */
2132    unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
2133    if (mask) {
2134       do {
2135          sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
2136       } while (mask);
2137 
2138       sctx->dirty_atoms &= skip_atom_mask;
2139    }
2140 
2141    /* Emit states. */
2142    mask = sctx->dirty_states;
2143    if (mask) {
2144       do {
2145          unsigned i = u_bit_scan(&mask);
2146          struct si_pm4_state *state = sctx->queued.array[i];
2147 
2148          /* All places should unset dirty_states if this doesn't pass. */
2149          assert(state && state != sctx->emitted.array[i]);
2150 
2151          si_pm4_emit(sctx, state);
2152          sctx->emitted.array[i] = state;
2153       } while (mask);
2154 
2155       sctx->dirty_states = 0;
2156    }
2157 
2158    /* Emit draw states. */
2159    si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>(sctx, info->index_size);
2160    si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
2161          (sctx, indirect, prim, num_patches, instance_count, primitive_restart,
2162           info->restart_index, min_vertex_count);
2163 }
2164 
2165 #define DRAW_CLEANUP do {                                 \
2166       if (index_size && indexbuf != info->index.resource) \
2167          pipe_resource_reference(&indexbuf, NULL);        \
2168    } while (0)
2169 
2170 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
2171           si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, util_popcnt POPCNT> ALWAYS_INLINE
2172 static void si_draw(struct pipe_context *ctx,
2173                     const struct pipe_draw_info *info,
2174                     unsigned drawid_offset,
2175                     const struct pipe_draw_indirect_info *indirect,
2176                     const struct pipe_draw_start_count_bias *draws,
2177                     unsigned num_draws,
2178                     struct pipe_vertex_state *state,
2179                     uint32_t partial_velem_mask)
2180 {
2181    /* Keep code that uses the least number of local variables as close to the beginning
2182     * of this function as possible to minimize register pressure.
2183     *
2184     * It doesn't matter where we return due to invalid parameters because such cases
2185     * shouldn't occur in practice.
2186     */
2187    struct si_context *sctx = (struct si_context *)ctx;
2188 
2189    si_check_dirty_buffers_textures(sctx);
2190 
2191    si_decompress_textures(sctx, u_bit_consecutive(0, SI_NUM_GRAPHICS_SHADERS));
2192    si_need_gfx_cs_space(sctx, num_draws);
2193 
2194    if (HAS_TESS) {
2195       if (sctx->is_user_tcs) {
2196          struct si_shader_selector *tcs = sctx->shader.tcs.cso;
2197 
2198          bool same_patch_vertices =
2199             GFX_VERSION >= GFX9 &&
2200             sctx->patch_vertices == tcs->info.base.tess.tcs_vertices_out;
2201 
2202          if (sctx->shader.tcs.key.ge.opt.same_patch_vertices != same_patch_vertices) {
2203             sctx->shader.tcs.key.ge.opt.same_patch_vertices = same_patch_vertices;
2204             sctx->do_update_shaders = true;
2205          }
2206 
2207          if (GFX_VERSION == GFX9 && sctx->screen->info.has_ls_vgpr_init_bug) {
2208             /* Determine whether the LS VGPR fix should be applied.
2209              *
2210              * It is only required when num input CPs > num output CPs,
2211              * which cannot happen with the fixed function TCS.
2212              */
2213             bool ls_vgpr_fix =
2214                sctx->patch_vertices > tcs->info.base.tess.tcs_vertices_out;
2215 
2216             if (ls_vgpr_fix != sctx->shader.tcs.key.ge.part.tcs.ls_prolog.ls_vgpr_fix) {
2217                sctx->shader.tcs.key.ge.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix;
2218                sctx->do_update_shaders = true;
2219             }
2220          }
2221       } else {
2222          /* These fields are static for the fixed-function TCS, so there is no need to set
2223           * do_update_shaders between fixed-TCS draws. When switching between the fixed-function
2224           * TCS and a user TCS, do_update_shaders is already set by the bind state.
2225           */
2226          sctx->shader.tcs.key.ge.opt.same_patch_vertices = GFX_VERSION >= GFX9;
2227          sctx->shader.tcs.key.ge.part.tcs.ls_prolog.ls_vgpr_fix = false;
2228 
2229          /* Only patch_vertices can change here; that requires updating the fixed-function TCS. */
2230          if (sctx->shader.tcs.cso &&
2231              sctx->shader.tcs.cso->info.base.tess.tcs_vertices_out != sctx->patch_vertices)
2232             sctx->do_update_shaders = true;
2233       }
2234    }
2235 
2236    enum pipe_prim_type prim = (enum pipe_prim_type)info->mode;
2237    unsigned instance_count = info->instance_count;
2238 
2239    /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
2240     * no workaround for indirect draws, but we can at least skip
2241     * direct draws.
2242     * 'instance_count == 0' seems to be problematic on Renoir chips (#4866),
2243     * so simplify the condition and drop these draws for all <= GFX9 chips.
2244     */
2245    if (GFX_VERSION <= GFX9 && unlikely(!IS_DRAW_VERTEX_STATE && !indirect && !instance_count))
2246       return;
2247 
2248    struct si_shader_selector *vs = sctx->shader.vs.cso;
2249    struct si_vertex_state *vstate = (struct si_vertex_state *)state;
2250    if (unlikely(!vs ||
2251                 (!IS_DRAW_VERTEX_STATE && sctx->num_vertex_elements < vs->info.num_vs_inputs) ||
2252                 (IS_DRAW_VERTEX_STATE && vstate->velems.count < vs->info.num_vs_inputs) ||
2253                 !sctx->shader.ps.cso || (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) {
2254       assert(0);
2255       return;
2256    }
2257 
2258    if (GFX_VERSION <= GFX9 && HAS_GS) {
2259       /* Determine whether the GS triangle strip adjacency fix should
2260        * be applied. Rotate every other triangle if triangle strips with
2261        * adjacency are fed to the GS. This doesn't work if primitive
2262        * restart occurs after an odd number of triangles.
2263        */
2264       bool gs_tri_strip_adj_fix =
2265          !HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
2266 
2267       if (gs_tri_strip_adj_fix != sctx->shader.gs.key.ge.mono.u.gs_tri_strip_adj_fix) {
2268          sctx->shader.gs.key.ge.mono.u.gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
2269          sctx->do_update_shaders = true;
2270       }
2271    }
2272 
2273    struct pipe_resource *indexbuf = info->index.resource;
2274    unsigned index_size = info->index_size;
2275    unsigned index_offset = indirect && indirect->buffer ? draws[0].start * index_size : 0;
2276 
2277    if (index_size) {
2278       /* Translate or upload, if needed. */
2279       /* 8-bit indices are natively supported only on GFX8 and newer. */
2280       if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
2281          unsigned start, count, start_offset, size, offset;
2282          void *ptr;
2283 
2284          si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count);
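         /* The ubyte indices are widened to ushort below, hence 2 bytes per index. */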
2285          start_offset = start * 2;
2286          size = count * 2;
2287 
2288          indexbuf = NULL;
2289          u_upload_alloc(ctx->stream_uploader, start_offset, size,
2290                         si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);
2291          if (unlikely(!indexbuf))
2292             return;
2293 
2294          util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);
2295 
2296          /* info->start will be added by the drawing code */
2297          index_offset = offset - start_offset;
2298          index_size = 2;
2299       } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
2300          unsigned start_offset;
2301 
2302          assert(!indirect);
2303          assert(num_draws == 1);
2304          start_offset = draws[0].start * index_size;
2305 
2306          indexbuf = NULL;
2307          u_upload_data(ctx->stream_uploader, start_offset, draws[0].count * index_size,
2308                        sctx->screen->info.tcc_cache_line_size,
2309                        (char *)info->index.user + start_offset, &index_offset, &indexbuf);
2310          if (unlikely(!indexbuf))
2311             return;
2312 
2313          /* info->start will be added by the drawing code */
2314          index_offset -= start_offset;
2315       } else if (GFX_VERSION <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
2316          /* GFX8 reads index buffers through TC L2, so it doesn't
2317           * need this. */
2318          sctx->flags |= SI_CONTEXT_WB_L2;
2319          si_resource(indexbuf)->TC_L2_dirty = false;
2320       }
2321    }
2322 
2323    unsigned min_direct_count = 0;
2324    unsigned total_direct_count = 0;
2325 
2326    if (!IS_DRAW_VERTEX_STATE && indirect) {
2327       /* Add the buffer size for memory checking in need_cs_space. */
2328       if (indirect->buffer)
2329          si_context_add_resource_size(sctx, indirect->buffer);
2330 
2331       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
2332       if (GFX_VERSION <= GFX8) {
2333          if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
2334             sctx->flags |= SI_CONTEXT_WB_L2;
2335             si_resource(indirect->buffer)->TC_L2_dirty = false;
2336          }
2337 
2338          if (indirect->indirect_draw_count &&
2339              si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
2340             sctx->flags |= SI_CONTEXT_WB_L2;
2341             si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
2342          }
2343       }
2344       total_direct_count = INT_MAX; /* just set something other than 0 to enable shader culling */
2345    } else {
2346       total_direct_count = min_direct_count = draws[0].count;
2347 
2348       for (unsigned i = 1; i < num_draws; i++) {
2349          unsigned count = draws[i].count;
2350 
2351          total_direct_count += count;
2352          min_direct_count = MIN2(min_direct_count, count);
2353       }
2354    }
2355 
2356    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
2357    bool primitive_restart =
2358       info->primitive_restart &&
2359       (!sctx->screen->options.prim_restart_tri_strips_only ||
2360        (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
2361 
2362    /* Set the rasterization primitive type.
2363     *
2364     * This must be done after si_decompress_textures, which can call
2365     * draw_vbo recursively, and before si_update_shaders, which uses
2366     * current_rast_prim for this draw_vbo call.
2367     */
2368    if (!HAS_GS && !HAS_TESS) {
2369       enum pipe_prim_type rast_prim;
2370 
2371       if (util_rast_prim_is_triangles(prim)) {
2372          rast_prim = PIPE_PRIM_TRIANGLES;
2373       } else {
2374          /* Only remaining possibilities: POINTS, LINE*, RECTANGLES */
2375          rast_prim = prim;
2376       }
2377 
2378       if (rast_prim != sctx->current_rast_prim) {
2379          if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=
2380              util_prim_is_points_or_lines(rast_prim))
2381             si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
2382 
2383          sctx->current_rast_prim = rast_prim;
2384          sctx->do_update_shaders = true;
2385       }
2386    }
2387 
2388    if (IS_DRAW_VERTEX_STATE) {
2389       /* draw_vertex_state doesn't use the current vertex buffers and vertex elements,
2390        * so disable any non-trivial VS prolog that is based on them, such as vertex
2391        * format lowering.
2392        */
2393       if (!sctx->force_trivial_vs_prolog) {
2394          sctx->force_trivial_vs_prolog = true;
2395 
2396          /* Update shaders to disable the non-trivial VS prolog. */
2397          if (sctx->uses_nontrivial_vs_prolog) {
2398             si_vs_key_update_inputs(sctx);
2399             sctx->do_update_shaders = true;
2400          }
2401       }
2402    } else {
2403       if (sctx->force_trivial_vs_prolog) {
2404          sctx->force_trivial_vs_prolog = false;
2405 
2406          /* Update shaders to enable the non-trivial VS prolog. */
2407          if (sctx->uses_nontrivial_vs_prolog) {
2408             si_vs_key_update_inputs(sctx);
2409             sctx->do_update_shaders = true;
2410          }
2411       }
2412    }
2413 
2414    /* Update NGG culling settings. */
2415    uint16_t old_ngg_culling = sctx->ngg_culling;
2416    if (GFX_VERSION >= GFX10) {
2417       struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso;
2418 
2419       if (NGG &&
2420           /* Tessellation and GS set ngg_cull_vert_threshold to UINT_MAX if the prim type
2421            * is not points, so this check is only needed for VS. */
2422           (HAS_TESS || HAS_GS || util_rast_prim_is_lines_or_triangles(sctx->current_rast_prim)) &&
2423           /* Culling starts disabled for the first draws with a new shader; it stays disabled
2424            * until a draw passes the total_direct_count check, then stays enabled until the
2425            * shader is changed. This eliminates most culling on/off state changes. */
2426           (old_ngg_culling || total_direct_count > hw_vs->ngg_cull_vert_threshold)) {
2427          /* Check that the current shader allows culling. */
2428          assert(hw_vs->ngg_cull_vert_threshold != UINT_MAX);
2429 
2430          uint16_t ngg_culling;
2431 
2432          if (util_prim_is_lines(sctx->current_rast_prim)) {
2433             /* Overwrite it to mask out face cull flags. */
2434             ngg_culling = rs->ngg_cull_flags_lines;
2435          } else {
2436             ngg_culling = sctx->viewport0_y_inverted ? rs->ngg_cull_flags_tris_y_inverted :
2437                                                        rs->ngg_cull_flags_tris;
2438             assert(ngg_culling); /* rasterizer state should always set this to non-zero */
2439          }
2440 
2441          if (ngg_culling != old_ngg_culling) {
2442             /* If shader compilation is not ready, this setting will be rejected. */
2443             sctx->ngg_culling = ngg_culling;
2444             sctx->do_update_shaders = true;
2445          }
2446       } else if (old_ngg_culling) {
2447          sctx->ngg_culling = 0;
2448          sctx->do_update_shaders = true;
2449       }
2450    }
2451 
2452    if (unlikely(sctx->do_update_shaders)) {
2453       if (unlikely(!(si_update_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
2454          DRAW_CLEANUP;
2455          return;
2456       }
2457 
2458       /* si_update_shaders can clear the ngg_culling in the shader key if the shader compilation
2459        * hasn't finished. Set it to the correct value in si_context.
2460        */
2461       if (GFX_VERSION >= GFX10 && NGG)
2462          sctx->ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.ge.opt.ngg_culling;
2463    }
2464 
2465    /* Since we've called si_context_add_resource_size for vertex buffers,
2466     * this must be called after si_need_cs_space, because we must let
2467     * need_cs_space flush before we add buffers to the buffer list.
2468     *
2469     * This must be done after si_update_shaders because si_update_shaders can
2470     * flush the CS when enabling tess and GS rings.
2471     */
2472    if (sctx->bo_list_add_all_gfx_resources)
2473       si_gfx_resources_add_all_to_bo_list(sctx);
2474 
2475    /* Graphics shader descriptors must be uploaded after si_update_shaders because
2476     * it binds tess and GS ring buffers.
2477     */
2478    if (unlikely(!si_upload_graphics_shader_descriptors(sctx))) {
2479       DRAW_CLEANUP;
2480       return;
2481    }
2482 
2483    /* Vega10/Raven scissor bug workaround. When any context register is
2484     * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
2485     * registers must be written too.
2486     */
2487    unsigned masked_atoms = 0;
2488    bool gfx9_scissor_bug = false;
2489 
2490    if (GFX_VERSION == GFX9 && sctx->screen->info.has_gfx9_scissor_bug) {
2491       masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
2492       gfx9_scissor_bug = true;
2493 
2494       if ((!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) ||
2495           sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
2496           sctx->dirty_states & si_states_that_always_roll_context())
2497          sctx->context_roll = true;
2498    }
2499 
2500    /* Use optimal packet order based on whether we need to sync the pipeline. */
2501    if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
2502                                SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
2503                                SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_VGT_FLUSH))) {
2504       /* If we have to wait for idle, set all states first, so that all
2505        * SET packets are processed in parallel with previous draw calls.
2506        * Then draw and prefetch at the end. This ensures that the time
2507        * the CUs are idle is very short.
2508        */
2509       if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
2510          masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
2511 
2512       /* Emit all states except possibly render condition. */
2513       si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
2514             (sctx, info, indirect, prim, instance_count, min_direct_count,
2515              primitive_restart, masked_atoms);
2516       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
2517       /* <-- CUs are idle here. */
2518 
2519       /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
2520        * It should be done after cache flushing.
2521        */
2522       if (unlikely((!si_upload_and_prefetch_VB_descriptors
2523                         <GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE, POPCNT>
2524                         (sctx, state, partial_velem_mask)))) {
2525          DRAW_CLEANUP;
2526          return;
2527       }
2528 
2529       if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
2530          sctx->atoms.s.render_cond.emit(sctx);
2531          sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
2532       }
2533 
2534       if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
2535           (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
2536          sctx->atoms.s.scissors.emit(sctx);
2537          sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
2538       }
2539       assert(sctx->dirty_atoms == 0);
2540 
2541       si_emit_draw_packets<GFX_VERSION, NGG, IS_DRAW_VERTEX_STATE>
2542             (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
2543              index_size, index_offset, instance_count);
2544       /* <-- CUs are busy here. */
2545 
2546       /* Start prefetches after the draw has been started. Both will run
2547        * in parallel, but starting the draw first is more important.
2548        */
2549       si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_ALL>(sctx);
2550    } else {
2551       /* If we don't wait for idle, start prefetches first, then set
2552        * states, and draw at the end.
2553        */
2554       if (sctx->flags)
2555          sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
2556 
2557       /* Only prefetch the API VS and VBO descriptors. */
2558       si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_BEFORE_DRAW>(sctx);
2559 
2560       /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
2561        * It should be done after cache flushing and after the VS prefetch.
2562        */
2563       if (unlikely((!si_upload_and_prefetch_VB_descriptors
2564                        <GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE, POPCNT>
2565                        (sctx, state, partial_velem_mask)))) {
2566          DRAW_CLEANUP;
2567          return;
2568       }
2569 
2570       si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
2571             (sctx, info, indirect, prim, instance_count, min_direct_count,
2572              primitive_restart, masked_atoms);
2573 
2574       if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
2575           (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
2576          sctx->atoms.s.scissors.emit(sctx);
2577          sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
2578       }
2579       assert(sctx->dirty_atoms == 0);
2580 
2581       si_emit_draw_packets<GFX_VERSION, NGG, IS_DRAW_VERTEX_STATE>
2582             (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf,
2583              index_size, index_offset, instance_count);
2584 
2585       /* Prefetch the remaining shaders after the draw has been
2586        * started. */
2587       si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_AFTER_DRAW>(sctx);
2588    }
2589 
2590    /* Clear the context roll flag after the draw call.
2591     * It's only used by the gfx9 scissor bug workaround.
2592     */
2593    if (GFX_VERSION == GFX9)
2594       sctx->context_roll = false;
2595 
2596    if (unlikely(sctx->current_saved_cs)) {
2597       si_trace_emit(sctx);
2598       si_log_draw_state(sctx, sctx->log);
2599    }
2600 
2601    /* Workaround for a VGT hang when streamout is enabled.
2602     * It must be done after drawing. */
2603    if (((GFX_VERSION == GFX7 && sctx->family == CHIP_HAWAII) ||
2604         (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) &&
2605        si_get_strmout_en(sctx)) {
2606       sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
2607    }
2608 
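        /* Update draw statistics; draws issued for internal decompression blits
         * are counted separately from application draw calls.
         */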
2609    if (unlikely(sctx->decompression_enabled)) {
2610       sctx->num_decompress_calls++;
2611    } else {
2612       sctx->num_draw_calls += num_draws;
2613       if (primitive_restart)
2614          sctx->num_prim_restart_calls += num_draws;
2615    }
2616 
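        /* The draw may have written depth, so the bound depth level can no longer
         * be assumed to contain only the clear value.
         */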
2617    if (sctx->framebuffer.state.zsbuf) {
2618       struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
2619       zstex->depth_cleared_level_mask &= ~BITFIELD_BIT(sctx->framebuffer.state.zsbuf->u.tex.level);
2620    }
2621 
2622    DRAW_CLEANUP;
2623 }
2624 
2625 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
2626 static void si_draw_vbo(struct pipe_context *ctx,
2627                         const struct pipe_draw_info *info,
2628                         unsigned drawid_offset,
2629                         const struct pipe_draw_indirect_info *indirect,
2630                         const struct pipe_draw_start_count_bias *draws,
2631                         unsigned num_draws)
2632 {
2633    si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_OFF, POPCNT_NO>
2634       (ctx, info, drawid_offset, indirect, draws, num_draws, NULL, 0);
2635 }
2636 
2637 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
2638           util_popcnt POPCNT>
2639 static void si_draw_vertex_state(struct pipe_context *ctx,
2640                                  struct pipe_vertex_state *vstate,
2641                                  uint32_t partial_velem_mask,
2642                                  struct pipe_draw_vertex_state_info info,
2643                                  const struct pipe_draw_start_count_bias *draws,
2644                                  unsigned num_draws)
2645 {
2646    struct si_vertex_state *state = (struct si_vertex_state *)vstate;
2647    struct pipe_draw_info dinfo = {};
2648 
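        /* Vertex-state draws are always indexed with 32-bit indices, non-instanced,
         * and use the index buffer stored in the vertex state object.
         */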
2649    dinfo.mode = info.mode;
2650    dinfo.index_size = 4;
2651    dinfo.instance_count = 1;
2652    dinfo.index.resource = state->b.input.indexbuf;
2653 
2654    si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_ON, POPCNT>
2655       (ctx, &dinfo, 0, NULL, draws, num_draws, vstate, partial_velem_mask);
2656 
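        /* Drop the reference if the caller handed over ownership of the vertex state. */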
2657    if (info.take_vertex_state_ownership)
2658       pipe_vertex_state_reference(&vstate, NULL);
2659 }
2660 
2661 static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso,
2662                               blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2,
2663                               float depth, unsigned num_instances, enum blitter_attrib_type type,
2664                               const union blitter_attrib *attrib)
2665 {
2666    struct pipe_context *pipe = util_blitter_get_pipe(blitter);
2667    struct si_context *sctx = (struct si_context *)pipe;
2668 
2669    /* Pack position coordinates as signed int16. */
2670    sctx->vs_blit_sh_data[0] = (uint32_t)(x1 & 0xffff) | ((uint32_t)(y1 & 0xffff) << 16);
2671    sctx->vs_blit_sh_data[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(y2 & 0xffff) << 16);
2672    sctx->vs_blit_sh_data[2] = fui(depth);
2673 
2674    switch (type) {
2675    case UTIL_BLITTER_ATTRIB_COLOR:
2676       memcpy(&sctx->vs_blit_sh_data[3], attrib->color, sizeof(float) * 4);
2677       break;
2678    case UTIL_BLITTER_ATTRIB_TEXCOORD_XY:
2679    case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW:
2680       memcpy(&sctx->vs_blit_sh_data[3], &attrib->texcoord, sizeof(attrib->texcoord));
2681       break;
2682    case UTIL_BLITTER_ATTRIB_NONE:;
2683    }
2684 
2685    pipe->bind_vs_state(pipe, si_get_blitter_vs(sctx, type, num_instances));
2686 
2687    struct pipe_draw_info info = {};
2688    struct pipe_draw_start_count_bias draw;
2689 
2690    info.mode = SI_PRIM_RECTANGLE_LIST;
2691    info.instance_count = num_instances;
2692 
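        /* A rectangle list primitive is specified with 3 vertices; the hardware
         * derives the 4th corner.
         */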
2693    draw.start = 0;
2694    draw.count = 3;
2695 
2696    /* Don't set per-stage shader pointers for VS. */
2697    sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(VERTEX);
2698    sctx->vertex_buffer_pointer_dirty = false;
2699    sctx->vertex_buffer_user_sgprs_dirty = false;
2700 
2701    pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1);
2702 }
2703 
2704 template <amd_gfx_level GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
2705 static void si_init_draw_vbo(struct si_context *sctx)
2706 {
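        /* There is no NGG before GFX10; skip NGG variants on older chips. */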
2707    if (NGG && GFX_VERSION < GFX10)
2708       return;
2709 
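        /* GFX11 only has the NGG geometry pipeline, so skip the legacy variants. */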
2710    if (!NGG && GFX_VERSION >= GFX11)
2711       return;
2712 
2713    sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
2714       si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG>;
2715 
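        /* Select the vertex-state draw variant that can use the CPU's popcnt
         * instruction for the bit counting done on partial_velem_mask.
         */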
2716    if (util_get_cpu_caps()->has_popcnt) {
2717       sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
2718          si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_YES>;
2719    } else {
2720       sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
2721          si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_NO>;
2722    }
2723 }
2724 
2725 template <amd_gfx_level GFX_VERSION>
2726 static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
2727 {
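        /* Register all 8 tess/GS/NGG combinations; si_init_draw_vbo itself skips
         * the combinations that the chip doesn't support.
         */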
2728    si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx);
2729    si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON,  NGG_OFF>(sctx);
2730    si_init_draw_vbo<GFX_VERSION, TESS_ON,  GS_OFF, NGG_OFF>(sctx);
2731    si_init_draw_vbo<GFX_VERSION, TESS_ON,  GS_ON,  NGG_OFF>(sctx);
2732    si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx);
2733    si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON,  NGG_ON>(sctx);
2734    si_init_draw_vbo<GFX_VERSION, TESS_ON,  GS_OFF, NGG_ON>(sctx);
2735    si_init_draw_vbo<GFX_VERSION, TESS_ON,  GS_ON,  NGG_ON>(sctx);
2736 }
2737 
2738 static void si_invalid_draw_vbo(struct pipe_context *pipe,
2739                                 const struct pipe_draw_info *info,
2740                                 unsigned drawid_offset,
2741                                 const struct pipe_draw_indirect_info *indirect,
2742                                 const struct pipe_draw_start_count_bias *draws,
2743                                 unsigned num_draws)
2744 {
2745    unreachable("vertex shader not bound");
2746 }
2747 
2748 static void si_invalid_draw_vertex_state(struct pipe_context *ctx,
2749                                          struct pipe_vertex_state *vstate,
2750                                          uint32_t partial_velem_mask,
2751                                          struct pipe_draw_vertex_state_info info,
2752                                          const struct pipe_draw_start_count_bias *draws,
2753                                          unsigned num_draws)
2754 {
2755    unreachable("vertex shader not bound");
2756 }
2757 
2758 extern "C"
2759 void GFX(si_init_draw_functions_)(struct si_context *sctx)
2760 {
2761    assert(sctx->gfx_level == GFX());
2762 
2763    si_init_draw_vbo_all_pipeline_options<GFX()>(sctx);
2764 
2765    /* Bind a fake draw_vbo so that draw_vbo isn't NULL; a NULL draw_vbo would skip
2766     * initialization of callbacks in upper layers (such as u_threaded_context).
2767     */
2768    sctx->b.draw_vbo = si_invalid_draw_vbo;
2769    sctx->b.draw_vertex_state = si_invalid_draw_vertex_state;
2770    sctx->blitter->draw_rectangle = si_draw_rectangle;
2771 
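        /* Precompute the table of IA_MULTI_VGT_PARAM values used when emitting draws. */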
2772    si_init_ia_multi_vgt_param_table(sctx);
2773 }
2774 
2775 #if GFX_VER == 6 /* declare this function only once because it supports all chips. */
2776 
2777 extern "C"
2778 void si_init_spi_map_functions(struct si_context *sctx)
2779 {
2780    /* This unrolls the loops in si_emit_spi_map and inlines memcmp and memcpys.
2781     * It improves performance for viewperf/snx.
2782     */
2783    sctx->emit_spi_map[0] = si_emit_spi_map<0>;
2784    sctx->emit_spi_map[1] = si_emit_spi_map<1>;
2785    sctx->emit_spi_map[2] = si_emit_spi_map<2>;
2786    sctx->emit_spi_map[3] = si_emit_spi_map<3>;
2787    sctx->emit_spi_map[4] = si_emit_spi_map<4>;
2788    sctx->emit_spi_map[5] = si_emit_spi_map<5>;
2789    sctx->emit_spi_map[6] = si_emit_spi_map<6>;
2790    sctx->emit_spi_map[7] = si_emit_spi_map<7>;
2791    sctx->emit_spi_map[8] = si_emit_spi_map<8>;
2792    sctx->emit_spi_map[9] = si_emit_spi_map<9>;
2793    sctx->emit_spi_map[10] = si_emit_spi_map<10>;
2794    sctx->emit_spi_map[11] = si_emit_spi_map<11>;
2795    sctx->emit_spi_map[12] = si_emit_spi_map<12>;
2796    sctx->emit_spi_map[13] = si_emit_spi_map<13>;
2797    sctx->emit_spi_map[14] = si_emit_spi_map<14>;
2798    sctx->emit_spi_map[15] = si_emit_spi_map<15>;
2799    sctx->emit_spi_map[16] = si_emit_spi_map<16>;
2800    sctx->emit_spi_map[17] = si_emit_spi_map<17>;
2801    sctx->emit_spi_map[18] = si_emit_spi_map<18>;
2802    sctx->emit_spi_map[19] = si_emit_spi_map<19>;
2803    sctx->emit_spi_map[20] = si_emit_spi_map<20>;
2804    sctx->emit_spi_map[21] = si_emit_spi_map<21>;
2805    sctx->emit_spi_map[22] = si_emit_spi_map<22>;
2806    sctx->emit_spi_map[23] = si_emit_spi_map<23>;
2807    sctx->emit_spi_map[24] = si_emit_spi_map<24>;
2808    sctx->emit_spi_map[25] = si_emit_spi_map<25>;
2809    sctx->emit_spi_map[26] = si_emit_spi_map<26>;
2810    sctx->emit_spi_map[27] = si_emit_spi_map<27>;
2811    sctx->emit_spi_map[28] = si_emit_spi_map<28>;
2812    sctx->emit_spi_map[29] = si_emit_spi_map<29>;
2813    sctx->emit_spi_map[30] = si_emit_spi_map<30>;
2814    sctx->emit_spi_map[31] = si_emit_spi_map<31>;
2815    sctx->emit_spi_map[32] = si_emit_spi_map<32>;
2816 }
2817 
2818 #endif
2819