/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"

#include "common/intel_compute_slm.h"
#include "common/intel_genX_state_brw.h"
#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"
#include "vk_log.h"
#include "vk_render_pass.h"

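/* Reserve n_dwords in the pipeline's batch and record their location in the
 * given anv_gfx_state_ptr so the span can later be copied into a command
 * buffer. Successive calls for the same pointer must be contiguous in the
 * batch (checked by the assert below).
 */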
static inline struct anv_batch *
anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
                     struct anv_gfx_state_ptr *ptr,
                     uint32_t n_dwords)
{
   struct anv_batch *batch = &pipeline->base.base.batch;

   assert(ptr->len == 0 ||
          (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
   if (ptr->len == 0)
      ptr->offset = (batch->next - batch->start) / 4;
   ptr->len += n_dwords;

   return batch;
}

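/* Pack an instruction into a caller-provided dword array (field) rather than
 * into the pipeline batch. Used to build a template that is later merged
 * into one or more final packets with anv_pipeline_emit_merge().
 */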
#define anv_pipeline_emit_tmp(pipeline, field, cmd, name)               \
   for (struct cmd name = { __anv_cmd_header(cmd) },                    \
           *_dst = (void *) field;                                      \
        __builtin_expect(_dst != NULL, 1);                              \
        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
                               _dst, &name);                            \
           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
           _dst = NULL;                                                 \
        }))

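/* Emit an instruction into the pipeline batch and track it in the given
 * anv_gfx_state_ptr (e.g. final.vf_sgvs) so it can later be copied into a
 * command buffer batch.
 */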
#define anv_pipeline_emit(pipeline, state, cmd, name)                   \
   for (struct cmd name = { __anv_cmd_header(cmd) },                    \
           *_dst = anv_batch_emit_dwords(                               \
              anv_gfx_pipeline_add(pipeline,                            \
                                   &(pipeline)->state,                  \
                                   __anv_cmd_length(cmd)),              \
              __anv_cmd_length(cmd));                                   \
        __builtin_expect(_dst != NULL, 1);                              \
        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
                               _dst, &name);                            \
           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
           _dst = NULL;                                                 \
        }))

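/* Like anv_pipeline_emit(), but OR the packed instruction with a
 * caller-provided dword array (typically built with anv_pipeline_emit_tmp())
 * before writing it to the batch.
 */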
#define anv_pipeline_emit_merge(pipeline, state, dwords, cmd, name) \
   for (struct cmd name = { 0 },                                        \
           *_dst = anv_batch_emit_dwords(                               \
              anv_gfx_pipeline_add(pipeline,                            \
                                   &(pipeline)->state,                  \
                                   __anv_cmd_length(cmd)),              \
              __anv_cmd_length(cmd));                                   \
        __builtin_expect(_dst != NULL, 1);                              \
        ({ uint32_t _partial[__anv_cmd_length(cmd)];                    \
           assert((pipeline)->state.len == __anv_cmd_length(cmd));      \
           __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
                               _partial, &name);                        \
           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {       \
              ((uint32_t *)_dst)[i] = _partial[i] | dwords[i];          \
           }                                                            \
           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
           _dst = NULL;                                                 \
         }))

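/* Variable-length variant of anv_pipeline_emit(): reserve n dwords in the
 * pipeline batch, pack the instruction header with the matching DWordLength
 * and return a pointer to the reserved space for the caller to fill in.
 */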
#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({             \
   void *__dst = anv_batch_emit_dwords(                                 \
      anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n);        \
   if (__dst) {                                                         \
      struct cmd __template = {                                         \
         __anv_cmd_header(cmd),                                         \
         .DWordLength = n - __anv_cmd_length_bias(cmd),                 \
         __VA_ARGS__                                                    \
      };                                                                \
      __anv_cmd_pack(cmd)(&pipeline->base.base.batch,                   \
                          __dst, &__template);                          \
   }                                                                    \
   __dst;                                                               \
   })

#define pipeline_needs_protected(pipeline) \
   ((pipeline)->device->vk.enabled_features.protectedMemory)

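/* Select the VERTEX_ELEMENT_STATE component control for component comp of
 * the given format: store the source when the component exists, otherwise
 * pad with 0/1 (or skip the component entirely for 64-bit passthrough
 * formats) according to the hardware rules quoted below.
 */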
static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From the Broadwell spec, command reference structures, page 586:
    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *   64-bit components are stored in the URB without any conversion. In
    *   this case, vertex elements must be written as 128 or 256 bits, with
    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
    *   256-bit vertex element."
    */
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk.
       */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware
       * restrictions (see comment above).
       */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

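/* Pack the VERTEX_ELEMENT_STATE entries for the given vertex input state
 * into vertex_element_dws and emit the matching 3DSTATE_VF_INSTANCING
 * packets, either into the pipeline state (emit_in_pipeline) or directly
 * into the given batch.
 */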
static void
emit_ves_vf_instancing(struct anv_batch *batch,
                       uint32_t *vertex_element_dws,
                       struct anv_graphics_pipeline *pipeline,
                       const struct vk_vertex_input_state *vi,
                       bool emit_in_pipeline)
{
   const struct anv_device *device = pipeline->base.base.device;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read & inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;

   for (uint32_t i = 0; i < pipeline->vs_input_elements; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zeros.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &vertex_element_dws[i * 2],
                                      &element);
   }

   u_foreach_bit(a, vi->attributes_valid) {
      enum isl_format format = anv_get_isl_format(device->physical,
                                                  vi->attributes[a].format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);
      assume(format < ISL_NUM_FORMATS);

      uint32_t binding = vi->attributes[a].binding;
      assert(binding < MAX_VBS);

      if ((elements & (1 << a)) == 0)
         continue; /* Binding unused */

      uint32_t slot =
         __builtin_popcount(elements & ((1 << a) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                         ((1 << a) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = vi->attributes[a].binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = vi->attributes[a].offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &vertex_element_dws[slot * 2],
                                      &element);

      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      if (emit_in_pipeline) {
         anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
            bool per_instance = vi->bindings[binding].input_rate ==
               VK_VERTEX_INPUT_RATE_INSTANCE;
            uint32_t divisor = vi->bindings[binding].divisor *
               pipeline->instance_multiplier;

            vfi.InstancingEnable = per_instance;
            vfi.VertexElementIndex = slot;
            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
         }
      } else {
         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
            bool per_instance = vi->bindings[binding].input_rate ==
               VK_VERTEX_INPUT_RATE_INSTANCE;
            uint32_t divisor = vi->bindings[binding].divisor *
               pipeline->instance_multiplier;

            vfi.InstancingEnable = per_instance;
            vfi.VertexElementIndex = slot;
            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
         }
      }
   }
}

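/* Emit 3DSTATE_VERTEX_ELEMENTS (and any needed 3DSTATE_VF_INSTANCING) into a
 * command buffer batch, combining the pipeline's precomputed vertex input
 * data with dynamic vertex input state when MESA_VK_DYNAMIC_VI is used.
 */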
void
genX(batch_emit_vertex_input)(struct anv_batch *batch,
                              struct anv_device *device,
                              struct anv_graphics_pipeline *pipeline,
                              const struct vk_vertex_input_state *vi)
{
   const uint32_t ve_count =
      pipeline->vs_input_elements + pipeline->svgs_count;
   const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
   uint32_t *p = anv_batch_emitn(batch, num_dwords,
                                 GENX(3DSTATE_VERTEX_ELEMENTS));
   if (p == NULL)
      return;

   if (ve_count == 0) {
      memcpy(p + 1, device->physical->empty_vs_input,
             sizeof(device->physical->empty_vs_input));
   } else if (ve_count == pipeline->vertex_input_elems) {
      /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so everything is
       * in pipeline->vertex_input_data and we can just memcpy.
       */
      memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
      anv_batch_emit_pipeline_state(batch, pipeline, final.vf_instancing);
   } else {
      assert(pipeline->final.vf_instancing.len == 0);
      /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
      emit_ves_vf_instancing(batch, p + 1, pipeline, vi,
                             false /* emit_in_pipeline */);
      /* Then append the VERTEX_ELEMENT_STATE for the draw parameters. */
      memcpy(p + 1 + 2 * pipeline->vs_input_elements,
             pipeline->vertex_input_data,
             4 * 2 * pipeline->vertex_input_elems);
   }
}

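/* Pipeline-time counterpart of genX(batch_emit_vertex_input): pack the
 * static vertex elements plus the system-generated value elements
 * (BaseVertex/BaseInstance, DrawID) and record 3DSTATE_VF_SGVS (plus
 * 3DSTATE_VF_SGVS_2 on gen11+).
 */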
static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const struct vk_graphics_pipeline_state *state,
                  const struct vk_vertex_input_state *vi)
{
   /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
    * everything in gfx8_cmd_buffer.c
    */
   if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
      emit_ves_vf_instancing(NULL,
                             pipeline->vertex_input_data,
                             pipeline, vi, true /* emit_in_pipeline */);
   }

   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const bool needs_svgs_elem = pipeline->svgs_count > 1 ||
                                !vs_prog_data->uses_drawid;
   const uint32_t id_slot = pipeline->vs_input_elements;
   const uint32_t drawid_slot = id_slot + needs_svgs_elem;
   if (pipeline->svgs_count > 0) {
      assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
      uint32_t slot_offset =
         pipeline->vertex_input_elems - pipeline->svgs_count;

      if (needs_svgs_elem) {
#if GFX_VER < 11
         /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
          *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
          *    Control field is set to something other than VFCOMP_STORE_SRC,
          *    no higher-numbered Component Control fields may be set to
          *    VFCOMP_STORE_SRC"
          *
          * This means that if we have BaseInstance, we need BaseVertex as
          * well.  Just do all or nothing.
          */
         uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                               vs_prog_data->uses_baseinstance) ?
                              VFCOMP_STORE_SRC : VFCOMP_STORE_0;
#endif

         struct GENX(VERTEX_ELEMENT_STATE) element = {
            .VertexBufferIndex = ANV_SVGS_VB_INDEX,
            .Valid = true,
            .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
#if GFX_VER >= 11
            /* On gen11, these are taken care of by extra parameter slots */
            .Component0Control = VFCOMP_STORE_0,
            .Component1Control = VFCOMP_STORE_0,
#else
            .Component0Control = base_ctrl,
            .Component1Control = base_ctrl,
#endif
            .Component2Control = VFCOMP_STORE_0,
            .Component3Control = VFCOMP_STORE_0,
         };
         GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                         &pipeline->vertex_input_data[slot_offset * 2],
                                         &element);
         slot_offset++;

         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
                           GENX(3DSTATE_VF_INSTANCING), vfi) {
            vfi.VertexElementIndex = id_slot;
         }
      }

      if (vs_prog_data->uses_drawid) {
         struct GENX(VERTEX_ELEMENT_STATE) element = {
            .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
            .Valid = true,
            .SourceElementFormat = ISL_FORMAT_R32_UINT,
#if GFX_VER >= 11
            /* On gen11, this is taken care of by extra parameter slots */
            .Component0Control = VFCOMP_STORE_0,
#else
            .Component0Control = VFCOMP_STORE_SRC,
#endif
            .Component1Control = VFCOMP_STORE_0,
            .Component2Control = VFCOMP_STORE_0,
            .Component3Control = VFCOMP_STORE_0,
         };
         GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                         &pipeline->vertex_input_data[slot_offset * 2],
                                         &element);
         slot_offset++;

         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
                           GENX(3DSTATE_VF_INSTANCING), vfi) {
            vfi.VertexElementIndex = drawid_slot;
         }
      }
   }

   anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber     = 2;
      sgvs.VertexIDElementOffset       = id_slot;
      sgvs.InstanceIDEnable            = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber   = 3;
      sgvs.InstanceIDElementOffset     = id_slot;
   }

#if GFX_VER >= 11
   anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
      /* gl_BaseVertex */
      sgvs.XP0Enable                   = vs_prog_data->uses_firstvertex;
      sgvs.XP0SourceSelect             = XP0_PARAMETER;
      sgvs.XP0ComponentNumber          = 0;
      sgvs.XP0ElementOffset            = id_slot;

      /* gl_BaseInstance */
      sgvs.XP1Enable                   = vs_prog_data->uses_baseinstance;
      sgvs.XP1SourceSelect             = StartingInstanceLocation;
      sgvs.XP1ComponentNumber          = 1;
      sgvs.XP1ElementOffset            = id_slot;

      /* gl_DrawID */
      sgvs.XP2Enable                   = vs_prog_data->uses_drawid;
      sgvs.XP2ComponentNumber          = 0;
      sgvs.XP2ElementOffset            = drawid_slot;
   }
#endif
}

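/* Compute a URB layout for the currently active stages and emit the
 * per-stage 3DSTATE_URB_* packets into the given batch. urb_cfg_in holds the
 * previous configuration (used for Wa_16014912113) and urb_cfg_out receives
 * the new one.
 */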
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct intel_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const struct intel_urb_config *urb_cfg_in,
                     struct intel_urb_config *urb_cfg_out,
                     enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = device->info;

   bool constrained;
   intel_get_urb_config(devinfo, l3_config,
                        active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                        urb_cfg_out, deref_block_size,
                        &constrained);

#if INTEL_NEEDS_WA_16014912113
   if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out,
       MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) {
      for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
         anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
            urb._3DCommandSubOpcode             += i;
            urb.VSURBEntryAllocationSize        = urb_cfg_in->size[i] - 1;
            urb.VSURBStartingAddressSlice0      = urb_cfg_in->start[i];
            urb.VSURBStartingAddressSliceN      = urb_cfg_in->start[i];
            urb.VSNumberofURBEntriesSlice0      = i == 0 ? 256 : 0;
            urb.VSNumberofURBEntriesSliceN      = i == 0 ? 256 : 0;
         }
#else
         anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
            urb._3DCommandSubOpcode      += i;
            urb.VSURBStartingAddress      = urb_cfg_in->start[i];
            urb.VSURBEntryAllocationSize  = urb_cfg_in->size[i] - 1;
            urb.VSNumberofURBEntries      = i == 0 ? 256 : 0;
         }
#endif
      }
      genx_batch_emit_pipe_control(batch, device->info, _3D,
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode             += i;
         urb.VSURBEntryAllocationSize        = urb_cfg_out->size[i] - 1;
         urb.VSURBStartingAddressSlice0      = urb_cfg_out->start[i];
         urb.VSURBStartingAddressSliceN      = urb_cfg_out->start[i];
         urb.VSNumberofURBEntriesSlice0      = urb_cfg_out->entries[i];
         urb.VSNumberofURBEntriesSliceN      = urb_cfg_out->entries[i];
      }
#else
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = urb_cfg_out->start[i];
         urb.VSURBEntryAllocationSize  = urb_cfg_out->size[i] - 1;
         urb.VSNumberofURBEntries      = urb_cfg_out->entries[i];
      }
#endif
   }

#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
   }
#endif
}

#if GFX_VERx10 >= 125
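/* Program the mesh/task URB allocations (3DSTATE_URB_ALLOC_TASK/MESH) for a
 * mesh pipeline and zero out the legacy geometry stage allocations.
 */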
static void
emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
                    enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = pipeline->base.base.device->info;

   const struct brw_task_prog_data *task_prog_data =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
      get_task_prog_data(pipeline) : NULL;
   const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);

   const struct intel_mesh_urb_allocation alloc =
      intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
                                task_prog_data ? task_prog_data->map.size_dw : 0,
                                mesh_prog_data->map.size_dw);

   /* Zero out the primitive pipeline URB allocations. */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
#else
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
#endif
   }

   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
      if (task_prog_data) {
         urb.TASKURBEntryAllocationSize   = alloc.task_entry_size_64b - 1;
         urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
         urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
         urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
         urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
      }
   }

   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
      urb.MESHURBEntryAllocationSize   = alloc.mesh_entry_size_64b - 1;
      urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
      urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
      urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
      urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
   }

   *deref_block_size = alloc.deref_block_size;
}
#endif

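/* Pipeline-time URB setup: derive per-stage entry sizes from the compiled
 * shaders, compute the URB configuration and record the 3DSTATE_URB_*
 * packets in the pipeline state.
 */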
static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum intel_urb_deref_block_size *deref_block_size)
{
#if GFX_VERx10 >= 125
   if (anv_pipeline_is_mesh(pipeline)) {
      emit_urb_setup_mesh(pipeline, deref_block_size);
      return;
   }
#endif
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data;

      pipeline->urb_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }

   struct anv_device *device = pipeline->base.base.device;
   const struct intel_device_info *devinfo = device->info;

   bool constrained;
   intel_get_urb_config(devinfo,
                        pipeline->base.base.l3_config,
                        pipeline->base.base.active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        pipeline->base.base.active_stages &
                           VK_SHADER_STAGE_GEOMETRY_BIT,
                        &pipeline->urb_cfg, deref_block_size,
                        &constrained);

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode          += i;
         urb.VSURBEntryAllocationSize      = pipeline->urb_cfg.size[i] - 1;
         urb.VSURBStartingAddressSlice0    = pipeline->urb_cfg.start[i];
         urb.VSURBStartingAddressSliceN    = pipeline->urb_cfg.start[i];
         urb.VSNumberofURBEntriesSlice0    = pipeline->urb_cfg.entries[i];
         urb.VSNumberofURBEntriesSliceN    = pipeline->urb_cfg.entries[i];
      }
#else
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = pipeline->urb_cfg.start[i];
         urb.VSURBEntryAllocationSize  = pipeline->urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries      = pipeline->urb_cfg.entries[i];
      }
#endif
   }

#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
   }
#endif
}

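/* Return true when the fragment shader reads gl_PrimitiveID but no earlier
 * stage actually writes it, in which case SBE must be asked to supply the
 * value.
 */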
static bool
sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   if (!wm_prog_data)
      return false;

   if (anv_pipeline_is_mesh(pipeline)) {
      const struct brw_mesh_prog_data *mesh_prog_data =
         get_mesh_prog_data(pipeline);
      const struct brw_mue_map *mue = &mesh_prog_data->map;
      return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
              mue->start_dw[VARYING_SLOT_PRIMITIVE_ID] == -1;
   }

   const struct intel_vue_map *fs_input_map =
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
          fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
}

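/* Record 3DSTATE_SBE/3DSTATE_SBE_SWIZ (and 3DSTATE_SBE_MESH on supported
 * gens), mapping fragment shader inputs onto the attributes produced by the
 * last pre-rasterization stage.
 */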
static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
#if GFX_VERx10 >= 125
      if (anv_pipeline_is_mesh(pipeline))
         anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
#endif
      return;
   }

   anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
   anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {

      /* TODO(mesh): Figure out cases where we need attribute swizzling.  See also
       * calculate_urb_setup() and related functions.
       */
      sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
      sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;

      if (anv_pipeline_is_primitive(pipeline)) {
         const struct intel_vue_map *fs_input_map =
            &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

         int first_slot =
            brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                fs_input_map);
         assert(first_slot % 2 == 0);
         unsigned urb_entry_read_offset = first_slot / 2;
         int max_source_attr = 0;
         for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
            uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
            int input_index = wm_prog_data->urb_setup[attr];

            assert(0 <= input_index);

            /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in
             * the VUE header
             */
            if (attr == VARYING_SLOT_VIEWPORT ||
                attr == VARYING_SLOT_LAYER ||
                attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
               continue;
            }

            if (attr == VARYING_SLOT_PNTC) {
               sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
               continue;
            }

            const int slot = fs_input_map->varying_to_slot[attr];

            if (slot == -1) {
               /* This attribute does not exist in the VUE--that means that
                * the vertex shader did not write to it. It could be that it's
                * a regular varying read by the fragment shader but not
                * written by the vertex shader, or it's gl_PrimitiveID. In the
                * first case the value is undefined, in the second it needs to
                * be gl_PrimitiveID.
                */
               swiz.Attribute[input_index].ConstantSource = PRIM_ID;
               swiz.Attribute[input_index].ComponentOverrideX = true;
               swiz.Attribute[input_index].ComponentOverrideY = true;
               swiz.Attribute[input_index].ComponentOverrideZ = true;
               swiz.Attribute[input_index].ComponentOverrideW = true;
               continue;
            }

            /* We have to subtract two slots to account for the URB entry
             * output read offset in the VS and GS stages.
             */
            const int source_attr = slot - 2 * urb_entry_read_offset;
            assert(source_attr >= 0 && source_attr < 32);
            max_source_attr = MAX2(max_source_attr, source_attr);
            /* The hardware can only apply overrides to the first 16
             * attributes; the remaining up to 16 have to be lined up so that
             * the input index equals the output index. We'll need to do some
             * tweaking to make sure that's the case.
             */
            if (input_index < 16)
               swiz.Attribute[input_index].SourceAttribute = source_attr;
            else
               assert(source_attr == input_index);
         }

         sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
         sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
         sbe.ForceVertexURBEntryReadOffset = true;
         sbe.ForceVertexURBEntryReadLength = true;

         /* Ask the hardware to supply PrimitiveID if the fragment shader
          * reads it but a previous stage didn't write one.
          */
         if (sbe_primitive_id_override(pipeline)) {
            sbe.PrimitiveIDOverrideAttributeSelect =
               wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
            sbe.PrimitiveIDOverrideComponentX = true;
            sbe.PrimitiveIDOverrideComponentY = true;
            sbe.PrimitiveIDOverrideComponentZ = true;
            sbe.PrimitiveIDOverrideComponentW = true;
         }
      } else {
         assert(anv_pipeline_is_mesh(pipeline));
#if GFX_VERx10 >= 125
         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         anv_pipeline_emit(pipeline, final.sbe_mesh,
                           GENX(3DSTATE_SBE_MESH), sbe_mesh) {
            const struct brw_mue_map *mue = &mesh_prog_data->map;

            assert(mue->per_vertex_header_size_dw % 8 == 0);
            sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
            sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);

            /* The clip distance array is passed in the per-vertex header so
             * that it can be consumed by the HW. If the user wants to read
             * it in the FS, adjust the offset and length to cover it.
             * Conveniently it is at the end of the per-vertex header, right
             * before the per-vertex attributes.
             *
             * Note that FS attribute reading must be aware that the clip
             * distances have a fixed position.
             */
            if (mue->per_vertex_header_size_dw > 8 &&
                (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
                 wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
            }

            if (mue->user_data_in_vertex_header) {
               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
            }

            assert(mue->per_primitive_header_size_dw % 8 == 0);
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
               mue->per_primitive_header_size_dw / 8;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
               DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);

            /* Just like with clip distances, if Primitive Shading Rate,
             * Viewport Index or Layer is read back in the FS, adjust the
             * offset and length to cover the Primitive Header, where PSR,
             * Viewport Index & Layer are stored.
             */
            if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
                wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
                wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
                mue->user_data_in_primitive_header) {
               assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
               sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
               sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
            }
         }
#endif
      }
   }
   }
}

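/* Record the 3DSTATE_SF portion of the rasterization state, including
 * whether the point width comes from the shader (gl_PointSize) or from
 * static state.
 */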
static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const struct vk_input_assembly_state *ia,
              const struct vk_rasterization_state *rs,
              const struct vk_multisample_state *ms,
              const struct vk_render_pass_state *rp,
              enum intel_urb_deref_block_size urb_deref_block_size)
{
   anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
      sf.ViewportTransformEnable = true;
      sf.StatisticsEnable = true;
      sf.VertexSubPixelPrecisionSelect = _8Bit;
      sf.AALineDistanceMode = true;

#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif

      bool point_from_shader;
      if (anv_pipeline_is_primitive(pipeline)) {
         const struct brw_vue_prog_data *last_vue_prog_data =
            anv_pipeline_get_last_vue_prog_data(pipeline);
         point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
      } else {
         assert(anv_pipeline_is_mesh(pipeline));
         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
      }

      if (point_from_shader) {
         sf.PointWidthSource = Vertex;
      } else {
         sf.PointWidthSource = State;
         sf.PointWidth = 1.0;
      }
   }
}

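/* Record the 3DSTATE_CLIP state (and 3DSTATE_CLIP_MESH for mesh pipelines),
 * including whether the RTA index is forced to zero when no stage writes
 * gl_Layer.
 */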
static void
emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
                  const struct vk_input_assembly_state *ia,
                  const struct vk_viewport_state *vp,
                  const struct vk_rasterization_state *rs)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   (void) wm_prog_data;

   anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
      clip.ClipEnable               = true;
      clip.StatisticsEnable         = true;
      clip.EarlyCullEnable          = true;
      clip.GuardbandClipTestEnable  = true;

      clip.VertexSubPixelPrecisionSelect = _8Bit;
      clip.ClipMode = CLIPMODE_NORMAL;

      clip.MinimumPointWidth = 0.125;
      clip.MaximumPointWidth = 255.875;

      /* TODO(mesh): Multiview. */
      if (anv_pipeline_is_primitive(pipeline)) {
         const struct brw_vue_prog_data *last =
            anv_pipeline_get_last_vue_prog_data(pipeline);

         /* From the Vulkan 1.0.45 spec:
          *
          *    "If the last active vertex processing stage shader entry point's
          *    interface does not include a variable decorated with Layer, then
          *    the first layer is used."
          */
         clip.ForceZeroRTAIndexEnable =
            !(last->vue_map.slots_valid & VARYING_BIT_LAYER);

      } else if (anv_pipeline_is_mesh(pipeline)) {
         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);

         clip.ForceZeroRTAIndexEnable =
            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
      }

      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
         wm_prog_data->uses_nonperspective_interp_modes : 0;
   }

#if GFX_VERx10 >= 125
   if (anv_pipeline_is_mesh(pipeline)) {
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      anv_pipeline_emit(pipeline, final.clip_mesh,
                        GENX(3DSTATE_CLIP_MESH), clip_mesh) {
         clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
         clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
         clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
      }
   }
#endif
}

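/* Build the SO_DECL list from the last geometry stage's transform feedback
 * info and record 3DSTATE_SO_DECL_LIST and 3DSTATE_STREAMOUT in the pipeline
 * state.
 */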
static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                       const struct vk_rasterization_state *rs)
{
   const struct brw_vue_prog_data *prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
   const struct intel_vue_map *vue_map = &prog_data->vue_map;

   nir_xfb_info *xfb_info;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
      xfb_info = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->xfb_info;
   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      xfb_info = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
   else
      xfb_info = pipeline->base.shaders[MESA_SHADER_VERTEX]->xfb_info;

   if (xfb_info) {
      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};

      memset(so_decl, 0, sizeof(so_decl));

      for (unsigned i = 0; i < xfb_info->output_count; i++) {
         const nir_xfb_output_info *output = &xfb_info->outputs[i];
         unsigned buffer = output->buffer;
         unsigned stream = xfb_info->buffer_to_stream[buffer];

         /* Our hardware is unusual in that it requires us to program SO_DECLs
          * for fake "hole" components, rather than simply taking the offset
          * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
          * program as many size = 4 holes as we can, then a final hole to
          * accommodate the final 1, 2, or 3 remaining.
          */
         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
         while (hole_dwords > 0) {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = 1,
               .OutputBufferSlot = buffer,
               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
            };
            hole_dwords -= 4;
         }

         int varying = output->location;
         uint8_t component_mask = output->component_mask;
         /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
          * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
          * - VARYING_SLOT_LAYER                  in VARYING_SLOT_PSIZ.y
          * - VARYING_SLOT_VIEWPORT               in VARYING_SLOT_PSIZ.z
          * - VARYING_SLOT_PSIZ                   in VARYING_SLOT_PSIZ.w
          */
         if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 0; // SO_DECL_COMPMASK_X
         } else if (varying == VARYING_SLOT_LAYER) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
         } else if (varying == VARYING_SLOT_VIEWPORT) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
         } else if (varying == VARYING_SLOT_PSIZ) {
            component_mask = 1 << 3; // SO_DECL_COMPMASK_W
         }

         next_offset[buffer] = output->offset +
                               __builtin_popcount(component_mask) * 4;

         const int slot = vue_map->varying_to_slot[varying];
         if (slot < 0) {
            /* This can happen if the shader never writes to the varying.
             * Insert a hole instead of actual varying data.
             */
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = true,
               .OutputBufferSlot = buffer,
               .ComponentMask = component_mask,
            };
         } else {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .OutputBufferSlot = buffer,
               .RegisterIndex = slot,
               .ComponentMask = component_mask,
            };
         }
      }

      int max_decls = 0;
      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
         max_decls = MAX2(max_decls, decls[s]);

      uint8_t sbs[MAX_XFB_STREAMS] = { };
      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
         if (xfb_info->buffers_written & (1 << b))
            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
      }

      uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
                                        3 + 2 * max_decls,
                                        GENX(3DSTATE_SO_DECL_LIST),
                                        .StreamtoBufferSelects0 = sbs[0],
                                        .StreamtoBufferSelects1 = sbs[1],
                                        .StreamtoBufferSelects2 = sbs[2],
                                        .StreamtoBufferSelects3 = sbs[3],
                                        .NumEntries0 = decls[0],
                                        .NumEntries1 = decls[1],
                                        .NumEntries2 = decls[2],
                                        .NumEntries3 = decls[3]);

      for (int i = 0; i < max_decls; i++) {
         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
            &(struct GENX(SO_DECL_ENTRY)) {
               .Stream0Decl = so_decl[0][i],
               .Stream1Decl = so_decl[1][i],
               .Stream2Decl = so_decl[2][i],
               .Stream3Decl = so_decl[3][i],
            });
      }
   }

   anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
      if (xfb_info) {
         pipeline->uses_xfb = true;

         so.SOFunctionEnable = true;
         so.SOStatisticsEnable = true;

         so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
         so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
         so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
         so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;

         int urb_entry_read_offset = 0;
         int urb_entry_read_length =
            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;

         /* We always read the whole vertex. This could be reduced at some
          * point by reading less and offsetting the register index in the
          * SO_DECLs.
          */
         so.Stream0VertexReadOffset = urb_entry_read_offset;
         so.Stream0VertexReadLength = urb_entry_read_length - 1;
         so.Stream1VertexReadOffset = urb_entry_read_offset;
         so.Stream1VertexReadLength = urb_entry_read_length - 1;
         so.Stream2VertexReadOffset = urb_entry_read_offset;
         so.Stream2VertexReadLength = urb_entry_read_length - 1;
         so.Stream3VertexReadOffset = urb_entry_read_offset;
         so.Stream3VertexReadLength = urb_entry_read_length - 1;
      }
   }
}

static inline uint32_t
get_sampler_count(const struct anv_shader_bin *bin)
{
   /* We can potentially have way more than 32 samplers and that's ok.
    * However, the 3DSTATE_XS packets only have 3 bits to specify how
    * many to pre-fetch and all values above 4 are marked reserved.
    */
   return DIV_ROUND_UP(CLAMP(bin->bind_map.sampler_count, 0, 16), 4);
}

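/* Return the address of a scratch BO large enough for the shader's
 * total_scratch, allocating from the device scratch pool as needed.
 */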
static UNUSED struct anv_address
get_scratch_address(struct anv_pipeline *pipeline,
                    gl_shader_stage stage,
                    const struct anv_shader_bin *bin)
{
   return (struct anv_address) {
      .bo = anv_scratch_pool_alloc(pipeline->device,
                                   &pipeline->device->scratch_pool,
                                   stage, bin->prog_data->total_scratch),
      .offset = 0,
   };
}

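/* Encode total_scratch as the logarithmic PerThreadScratchSpace value used
 * by the 3DSTATE_XS packets: with this formula 0 selects 1KB, 1 selects 2KB,
 * 2 selects 4KB, and so on.
 */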
static UNUSED uint32_t
get_scratch_space(const struct anv_shader_bin *bin)
{
   return ffs(bin->prog_data->total_scratch / 2048);
}

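/* Return the scratch surface handle (pre-shifted as the ScratchSpaceBuffer
 * field expects) for gens that use scratch surfaces, picking the protected
 * scratch pool when requested.
 */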
static UNUSED uint32_t
get_scratch_surf(struct anv_pipeline *pipeline,
                 gl_shader_stage stage,
                 const struct anv_shader_bin *bin,
                 bool protected)
{
   if (bin->prog_data->total_scratch == 0)
      return 0;

   struct anv_scratch_pool *pool = protected ?
      &pipeline->device->protected_scratch_pool :
      &pipeline->device->scratch_pool;
   struct anv_bo *bo =
      anv_scratch_pool_alloc(pipeline->device, pool,
                             stage, bin->prog_data->total_scratch);
   anv_reloc_list_add_bo(pipeline->batch.relocs, bo);
   return anv_scratch_pool_get_surf(pipeline->device, pool,
                                    bin->prog_data->total_scratch) >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
}

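/* Record 3DSTATE_VS from the compiled vertex shader, emitting both a regular
 * and (when protected memory is enabled) a protected variant that differ
 * only in their scratch surface.
 */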
static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
   const struct intel_device_info *devinfo = pipeline->base.base.device->info;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const struct anv_shader_bin *vs_bin =
      pipeline->base.shaders[MESA_SHADER_VERTEX];

   assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));

   uint32_t vs_dwords[GENX(3DSTATE_VS_length)];
   anv_pipeline_emit_tmp(pipeline, vs_dwords, GENX(3DSTATE_VS), vs) {
      vs.Enable               = true;
      vs.StatisticsEnable     = true;
      vs.KernelStartPointer   = vs_bin->kernel.offset;
#if GFX_VER < 20
      vs.SIMD8DispatchEnable  =
         vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif

      assert(!vs_prog_data->base.base.use_alt_mode);
#if GFX_VER < 11
      vs.SingleVertexDispatch       = false;
#endif
      vs.VectorMaskEnable           = false;
      /* Wa_1606682166:
       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       * Disable the Sampler state prefetch functionality in the SARB by
       * programming 0xB000[30] to '1'.
       */
      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
      vs.BindingTableEntryCount     = vs_bin->bind_map.surface_count;
      vs.FloatingPointMode          = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable    = false;
      vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;

      if (GFX_VER == 9 && devinfo->gt == 4 &&
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         /* On Sky Lake GT4, we have experienced some hangs related to the VS
          * cache and tessellation.  It is unknown exactly what is happening
          * but the Haswell docs for the "VS Reference Count Full Force Miss
          * Enable" field of the "Thread Mode" register refer to a HSW bug in
          * which the VUE handle reference count would overflow resulting in
          * internal reference counting bugs.  My (Faith's) best guess is that
          * this bug cropped back up on SKL GT4 when we suddenly had more
          * threads in play than any previous gfx9 hardware.
          *
          * What we do know for sure is that setting this bit when
          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
          * Disabling the vertex cache with tessellation shaders should only
          * have a minor performance impact as the tessellation shaders are
          * likely generating and processing far more geometry than the vertex
          * stage.
          */
         vs.VertexCacheDisable = true;
      }

      vs.VertexURBEntryReadLength      = vs_prog_data->base.urb_read_length;
      vs.VertexURBEntryReadOffset      = 0;
      vs.DispatchGRFStartRegisterForURBData =
         vs_prog_data->base.base.dispatch_grf_start_reg;

      vs.UserClipDistanceClipTestEnableBitmask =
         vs_prog_data->base.clip_distance_mask;
      vs.UserClipDistanceCullTestEnableBitmask =
         vs_prog_data->base.cull_distance_mask;

#if GFX_VERx10 < 125
      vs.PerThreadScratchSpace   = get_scratch_space(vs_bin);
      vs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
#endif

#if GFX_VER >= 30
      vs.RegistersPerThread = ptl_register_blocks(vs_prog_data->base.base.grf_used);
#endif
   }

   anv_pipeline_emit_merge(pipeline, final.vs, vs_dwords, GENX(3DSTATE_VS), vs) {
#if GFX_VERx10 >= 125
      vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                               MESA_SHADER_VERTEX,
                                               vs_bin, false);
#endif
   }
   if (pipeline_needs_protected(&pipeline->base.base)) {
      anv_pipeline_emit_merge(pipeline, final.vs_protected,
                              vs_dwords, GENX(3DSTATE_VS), vs) {
#if GFX_VERx10 >= 125
         vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                                  MESA_SHADER_VERTEX,
                                                  vs_bin, true);
#endif
      }
   }
}

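/* Scratch programming differs by generation, as the #if blocks above show:
 * before Gfx12.5 each stage packet carries a per-thread scratch size plus a
 * base address from get_scratch_address(), while Gfx12.5+ instead points
 * ScratchSpaceBuffer at a scratch surface from get_scratch_surf().  The same
 * split recurs in every stage-emit function below.
 */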
static void
emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
                   const struct vk_tessellation_state *ts)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
      anv_pipeline_emit(pipeline, final.hs_protected, GENX(3DSTATE_HS), hs);
      anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
      anv_pipeline_emit(pipeline, final.ds_protected, GENX(3DSTATE_DS), ds);
      return;
   }

   const struct intel_device_info *devinfo = pipeline->base.base.device->info;
   const struct anv_shader_bin *tcs_bin =
      pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
   const struct anv_shader_bin *tes_bin =
      pipeline->base.shaders[MESA_SHADER_TESS_EVAL];

   const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

   uint32_t hs_dwords[GENX(3DSTATE_HS_length)];
   anv_pipeline_emit_tmp(pipeline, hs_dwords, GENX(3DSTATE_HS), hs) {
      hs.Enable = true;
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = tcs_bin->kernel.offset;
      /* Wa_1606682166 */
      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;

#if GFX_VER >= 12
      /* Wa_1604578095:
       *
       *    Hang occurs when the number of max threads is less than 2 times
       *    the number of instance count. The number of max threads must be
       *    more than 2 times the number of instance count.
       */
      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
#endif

      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      hs.IncludeVertexHandles = true;
      hs.InstanceCount = tcs_prog_data->instances - 1;

      hs.VertexURBEntryReadLength = 0;
      hs.VertexURBEntryReadOffset = 0;
      hs.DispatchGRFStartRegisterForURBData =
         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
#if GFX_VER >= 12
      hs.DispatchGRFStartRegisterForURBData5 =
         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
#endif

#if GFX_VERx10 < 125
      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
      hs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
#endif

#if GFX_VER == 12
      /*  Patch Count threshold specifies the maximum number of patches that
       *  will be accumulated before a thread dispatch is forced.
       */
      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
#endif

#if GFX_VER < 20
      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
#endif
      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;

#if GFX_VER >= 30
      hs.RegistersPerThread = ptl_register_blocks(tcs_prog_data->base.base.grf_used);
#endif
   }

   uint32_t ds_dwords[GENX(3DSTATE_DS_length)];
   anv_pipeline_emit_tmp(pipeline, ds_dwords, GENX(3DSTATE_DS), ds) {
      ds.Enable = true;
      ds.StatisticsEnable = true;
      ds.KernelStartPointer = tes_bin->kernel.offset;
      /* Wa_1606682166 */
      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

      ds.ComputeWCoordinateEnable =
         tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;

      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
      ds.PatchURBEntryReadOffset = 0;
      ds.DispatchGRFStartRegisterForURBData =
         tes_prog_data->base.base.dispatch_grf_start_reg;

#if GFX_VER < 11
      ds.DispatchMode =
         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
         DISPATCH_MODE_SIMD8_SINGLE_PATCH :
         DISPATCH_MODE_SIMD4X2;
#else
      assert(tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
#endif

      ds.UserClipDistanceClipTestEnableBitmask =
         tes_prog_data->base.clip_distance_mask;
      ds.UserClipDistanceCullTestEnableBitmask =
         tes_prog_data->base.cull_distance_mask;

#if GFX_VER >= 12
      ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
#endif
#if GFX_VERx10 < 125
      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
      ds.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
#endif

#if GFX_VER >= 30
      ds.RegistersPerThread = ptl_register_blocks(tes_prog_data->base.base.grf_used);
#endif
   }

   anv_pipeline_emit_merge(pipeline, final.hs, hs_dwords, GENX(3DSTATE_HS), hs) {
#if GFX_VERx10 >= 125
      hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                               MESA_SHADER_TESS_CTRL,
                                               tcs_bin, false);
#endif
   }
   anv_pipeline_emit_merge(pipeline, final.ds, ds_dwords, GENX(3DSTATE_DS), ds) {
#if GFX_VERx10 >= 125
      ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                               MESA_SHADER_TESS_EVAL,
                                               tes_bin, false);
#endif
   }
   if (pipeline_needs_protected(&pipeline->base.base)) {
      anv_pipeline_emit_merge(pipeline, final.hs_protected,
                              hs_dwords, GENX(3DSTATE_HS), hs) {
#if GFX_VERx10 >= 125
         hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                                  MESA_SHADER_TESS_CTRL,
                                                  tcs_bin, true);
#endif
      }
      anv_pipeline_emit_merge(pipeline, final.ds_protected,
                              ds_dwords, GENX(3DSTATE_DS), ds) {
#if GFX_VERx10 >= 125
         ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                                  MESA_SHADER_TESS_EVAL,
                                                  tes_bin, true);
#endif
      }
   }
}

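/* Helper used by two workarounds below: Wa_14015055625 (disable TE
 * distribution when primitive ID is in use) and, on Gfx12.5+,
 * Wa_14019166699 (switch 3DSTATE_VFG to instance-level granularity).
 */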
static UNUSED bool
geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_tcs_prog_data *tcs_prog_data =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ?
      get_tcs_prog_data(pipeline) : NULL;
   const struct brw_tes_prog_data *tes_prog_data =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
      get_tes_prog_data(pipeline) : NULL;
   const struct brw_gs_prog_data *gs_prog_data =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ?
      get_gs_prog_data(pipeline) : NULL;

   return (tcs_prog_data && tcs_prog_data->include_primitive_id) ||
          (tes_prog_data && tes_prog_data->include_primitive_id) ||
          (gs_prog_data && gs_prog_data->include_primitive_id);
}

static void
emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
{
   anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         const struct brw_tes_prog_data *tes_prog_data =
            get_tes_prog_data(pipeline);

         te.Partitioning = tes_prog_data->partitioning;
         te.TEDomain = tes_prog_data->domain;
         te.TEEnable = true;
         te.MaximumTessellationFactorOdd = 63.0;
         te.MaximumTessellationFactorNotOdd = 64.0;
#if GFX_VERx10 >= 125
         const struct anv_device *device = pipeline->base.base.device;
         if (intel_needs_workaround(device->info, 22012699309))
            te.TessellationDistributionMode = TEDMODE_RR_STRICT;
         else
            te.TessellationDistributionMode = TEDMODE_RR_FREE;

         if (intel_needs_workaround(device->info, 14015055625)) {
            /* Wa_14015055625:
             *
             * Disable Tessellation Distribution when primitive Id is enabled.
             */
            if (sbe_primitive_id_override(pipeline) ||
                geom_or_tess_prim_id_used(pipeline))
               te.TessellationDistributionMode = TEDMODE_OFF;
         }

#if GFX_VER >= 20
         te.TessellationDistributionLevel = TEDLEVEL_REGION;
#else
         te.TessellationDistributionLevel = TEDLEVEL_PATCH;
#endif
         /* 64_TRIANGLES */
         te.SmallPatchThreshold = 3;
         /* 1K_TRIANGLES */
         te.TargetBlockSize = 8;
         /* 1K_TRIANGLES */
         te.LocalBOPAccumulatorThreshold = 1;
#endif

#if GFX_VER >= 20
         te.NumberOfRegionsPerPatch = 2;
#endif
      }
   }
}

static void
emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
      anv_pipeline_emit(pipeline, partial.gs_protected, GENX(3DSTATE_GS), gs);
      return;
   }

   const struct intel_device_info *devinfo = pipeline->base.base.device->info;
   const struct anv_shader_bin *gs_bin =
      pipeline->base.shaders[MESA_SHADER_GEOMETRY];
   const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);

   uint32_t gs_dwords[GENX(3DSTATE_GS_length)];
   anv_pipeline_emit_tmp(pipeline, gs_dwords, GENX(3DSTATE_GS), gs) {
      gs.Enable                  = true;
      gs.StatisticsEnable        = true;
      gs.KernelStartPointer      = gs_bin->kernel.offset;
#if GFX_VER < 20
      gs.DispatchMode            = gs_prog_data->base.dispatch_mode;
#endif

      gs.SingleProgramFlow       = false;
      gs.VectorMaskEnable        = false;
      /* Wa_1606682166 */
      gs.SamplerCount            = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
      gs.BindingTableEntryCount  = gs_bin->bind_map.surface_count;
      gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
      gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;

      gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;

      gs.OutputVertexSize        = gs_prog_data->output_vertex_size_hwords * 2 - 1;
      gs.OutputTopology          = gs_prog_data->output_topology;
      gs.ControlDataFormat       = gs_prog_data->control_data_format;
      gs.ControlDataHeaderSize   = gs_prog_data->control_data_header_size_hwords;
      gs.InstanceControl         = MAX2(gs_prog_data->invocations, 1) - 1;

      gs.ExpectedVertexCount     = gs_prog_data->vertices_in;
      gs.StaticOutput            = gs_prog_data->static_vertex_count >= 0;
      gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
         gs_prog_data->static_vertex_count : 0;

      gs.VertexURBEntryReadOffset = 0;
      gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
      gs.DispatchGRFStartRegisterForURBData =
         gs_prog_data->base.base.dispatch_grf_start_reg;

      gs.UserClipDistanceClipTestEnableBitmask =
         gs_prog_data->base.clip_distance_mask;
      gs.UserClipDistanceCullTestEnableBitmask =
         gs_prog_data->base.cull_distance_mask;

#if GFX_VERx10 < 125
      gs.PerThreadScratchSpace   = get_scratch_space(gs_bin);
      gs.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
#endif

#if GFX_VER >= 30
      gs.RegistersPerThread = ptl_register_blocks(gs_prog_data->base.base.grf_used);
#endif
   }

   anv_pipeline_emit_merge(pipeline, partial.gs, gs_dwords, GENX(3DSTATE_GS), gs) {
#if GFX_VERx10 >= 125
      gs.ScratchSpaceBuffer =
         get_scratch_surf(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin, false);
#endif
   }
   if (pipeline_needs_protected(&pipeline->base.base)) {
      anv_pipeline_emit_merge(pipeline, partial.gs_protected,
                              gs_dwords, GENX(3DSTATE_GS), gs) {
#if GFX_VERx10 >= 125
         gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
                                                  MESA_SHADER_GEOMETRY,
                                                  gs_bin, true);
#endif
      }
   }
}

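/* The early depth/stencil selection in emit_3dstate_wm() below follows the
 * usual reasoning (stated here for clarity; not quoted from the BSpec):
 * EDSC_PREPS when the shader explicitly declares early fragment tests,
 * EDSC_PSEXEC when it has side effects (e.g. storage writes) and therefore
 * must execute even for pixels the depth test would discard, and
 * EDSC_NORMAL otherwise, letting the hardware choose.
 */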
static void
emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
                const struct vk_input_assembly_state *ia,
                const struct vk_rasterization_state *rs,
                const struct vk_multisample_state *ms,
                const struct vk_color_blend_state *cb,
                const struct vk_render_pass_state *rp)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
      wm.StatisticsEnable                    = true;
      wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
      wm.LineAntialiasingRegionWidth         = _10pixels;
      wm.PointRasterizationRule              = RASTRULE_UPPER_LEFT;

      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
         if (wm_prog_data->early_fragment_tests) {
            wm.EarlyDepthStencilControl         = EDSC_PREPS;
         } else if (wm_prog_data->has_side_effects) {
            wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
         } else {
            wm.EarlyDepthStencilControl         = EDSC_NORMAL;
         }
      }
   }
}

static void
emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                const struct vk_multisample_state *ms,
                const struct vk_color_blend_state *cb)
{
   UNUSED const struct intel_device_info *devinfo =
      pipeline->base.base.device->info;
   const struct anv_shader_bin *fs_bin =
      pipeline->base.shaders[MESA_SHADER_FRAGMENT];

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_pipeline_emit(pipeline, partial.ps, GENX(3DSTATE_PS), ps);
      anv_pipeline_emit(pipeline, partial.ps_protected, GENX(3DSTATE_PS), ps);
      return;
   }

   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   uint32_t ps_dwords[GENX(3DSTATE_PS_length)];
   anv_pipeline_emit_tmp(pipeline, ps_dwords, GENX(3DSTATE_PS), ps) {
#if GFX_VER == 12
      assert(wm_prog_data->dispatch_multi == 0 ||
             (wm_prog_data->dispatch_multi == 16 && wm_prog_data->max_polygons == 2));
      ps.DualSIMD8DispatchEnable = wm_prog_data->dispatch_multi;
      /* XXX - No major improvement observed from enabling
       *       overlapping subspans, but it could be helpful
       *       in theory when the requirements listed on the
       *       BSpec page for 3DSTATE_PS_BODY are met.
       */
      ps.OverlappingSubspansEnable = false;
#endif

      ps.SingleProgramFlow          = false;
      ps.VectorMaskEnable           = wm_prog_data->uses_vmask;
      /* Wa_1606682166 */
      ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
      ps.BindingTableEntryCount     = fs_bin->bind_map.surface_count;
#if GFX_VER < 20
      ps.PushConstantEnable         =
         devinfo->needs_null_push_constant_tbimr_workaround ||
         wm_prog_data->base.nr_params > 0 ||
         wm_prog_data->base.ubo_ranges[0].length;
#endif

      ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;

#if GFX_VERx10 < 125
      ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
      ps.ScratchSpaceBasePointer =
         get_scratch_address(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
#endif

#if GFX_VER >= 30
      ps.RegistersPerThread = ptl_register_blocks(wm_prog_data->base.grf_used);
#endif
   }
   anv_pipeline_emit_merge(pipeline, partial.ps, ps_dwords, GENX(3DSTATE_PS), ps) {
#if GFX_VERx10 >= 125
      ps.ScratchSpaceBuffer =
         get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin, false);
#endif
   }
   if (pipeline_needs_protected(&pipeline->base.base)) {
      anv_pipeline_emit_merge(pipeline, partial.ps_protected,
                              ps_dwords, GENX(3DSTATE_PS), ps) {
#if GFX_VERx10 >= 125
         ps.ScratchSpaceBuffer =
            get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin, true);
#endif
      }
   }
}

static void
emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
                      const struct vk_rasterization_state *rs,
                      const struct vk_graphics_pipeline_state *state)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
      return;
   }

   anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
      ps.PixelShaderValid              = true;
#if GFX_VER < 20
      ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
#endif
      ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
      ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
      ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
      ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;

      ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
#if GFX_VER >= 20
      assert(!wm_prog_data->pulls_bary);
#else
      ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
#endif

      ps.InputCoverageMaskState = ICMS_NONE;
      assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
      if (!wm_prog_data->uses_sample_mask)
         ps.InputCoverageMaskState = ICMS_NONE;
      else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
         ps.InputCoverageMaskState = ICMS_NORMAL;
      else if (wm_prog_data->post_depth_coverage)
         ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
      else
         ps.InputCoverageMaskState = ICMS_NORMAL;

#if GFX_VER >= 11
      ps.PixelShaderRequiresSubpixelSampleOffsets =
         wm_prog_data->uses_sample_offsets;
      ps.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
         wm_prog_data->uses_npc_bary_coefficients;
      ps.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
         wm_prog_data->uses_pc_bary_coefficients;
      ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
         wm_prog_data->uses_depth_w_coefficients;
#endif
   }
}

static void
compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
                   const struct vk_multisample_state *ms,
                   const struct vk_graphics_pipeline_state *state)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      pipeline->kill_pixel = false;
      return;
   }

   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   /* This computes the KillPixel portion of the computation for whether or
    * not we want to enable the PMA fix on gfx8 or gfx9.  It's given by this
    * chunk of the giant formula:
    *
    *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *     3DSTATE_PS_BLEND::AlphaTestEnable ||
    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
    * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
    * of an alpha test.
    */
   pipeline->rp_has_ds_self_dep =
      (state->pipeline_flags &
       VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) != 0;
   pipeline->kill_pixel =
      pipeline->rp_has_ds_self_dep ||
      wm_prog_data->uses_kill ||
      wm_prog_data->uses_omask ||
      (ms && ms->alpha_to_coverage_enable);
}

#if GFX_VER >= 12
static void
emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
                                   const struct vk_render_pass_state *rp)
{
   if (anv_pipeline_is_mesh(pipeline)) {
      anv_pipeline_emit(pipeline, final.primitive_replication,
                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
      return;
   }

   const int replication_count =
      anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;

   assert(replication_count >= 1);
   if (replication_count == 1) {
      anv_pipeline_emit(pipeline, final.primitive_replication,
                        GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
      return;
   }

   assert(replication_count == util_bitcount(rp->view_mask));
   assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);

   anv_pipeline_emit(pipeline, final.primitive_replication,
                     GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
      pr.ReplicaMask = (1 << replication_count) - 1;
      pr.ReplicationCount = replication_count - 1;

      int i = 0;
      u_foreach_bit(view_index, rp->view_mask) {
         pr.RTAIOffset[i] = view_index;
         i++;
      }
   }
}
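
/* Worked example: with view_mask = 0x5 (views 0 and 2) and both position
 * slots written, replication_count = 2, so the packet above gets
 * ReplicaMask = 0x3, ReplicationCount = 1, and RTAIOffset = { 0, 2 } --
 * i.e. replica N lands on the render target array index of view N.
 */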
#endif

#if GFX_VERx10 >= 125
static void
emit_task_state(struct anv_graphics_pipeline *pipeline)
{
   assert(anv_pipeline_is_mesh(pipeline));

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
      anv_pipeline_emit(pipeline, final.task_control,
                        GENX(3DSTATE_TASK_CONTROL), zero);
      anv_pipeline_emit(pipeline, final.task_control_protected,
                        GENX(3DSTATE_TASK_CONTROL), zero);
      anv_pipeline_emit(pipeline, final.task_shader,
                        GENX(3DSTATE_TASK_SHADER), zero);
      anv_pipeline_emit(pipeline, final.task_redistrib,
                        GENX(3DSTATE_TASK_REDISTRIB), zero);
      return;
   }

   const struct anv_shader_bin *task_bin =
      pipeline->base.shaders[MESA_SHADER_TASK];

   uint32_t task_control_dwords[GENX(3DSTATE_TASK_CONTROL_length)];
   anv_pipeline_emit_tmp(pipeline, task_control_dwords, GENX(3DSTATE_TASK_CONTROL), tc) {
      tc.TaskShaderEnable = true;
      tc.StatisticsEnable = true;
      tc.MaximumNumberofThreadGroups = 511;
   }

   anv_pipeline_emit_merge(pipeline, final.task_control,
                           task_control_dwords, GENX(3DSTATE_TASK_CONTROL), tc) {
      tc.ScratchSpaceBuffer =
         get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin, false);
   }
   if (pipeline_needs_protected(&pipeline->base.base)) {
      anv_pipeline_emit_merge(pipeline, final.task_control_protected,
                              task_control_dwords, GENX(3DSTATE_TASK_CONTROL), tc) {
         tc.ScratchSpaceBuffer =
            get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin, true);
      }
   }

   const struct intel_device_info *devinfo = pipeline->base.base.device->info;
   const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
   const struct intel_cs_dispatch_info task_dispatch =
      brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);

   anv_pipeline_emit(pipeline, final.task_shader,
                     GENX(3DSTATE_TASK_SHADER), task) {
      task.KernelStartPointer                = task_bin->kernel.offset;
      task.SIMDSize                          = task_dispatch.simd_size / 16;
      task.MessageSIMD                       = task.SIMDSize;
      task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads;
      task.ExecutionMask                     = task_dispatch.right_mask;
      task.LocalXMaximum                     = task_dispatch.group_size - 1;
      task.EmitLocalIDX                      = true;

      task.NumberofBarriers                  = task_prog_data->base.uses_barrier;
      task.SharedLocalMemorySize             =
         intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared);
      task.PreferredSLMAllocationSize        =
         intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                      task_prog_data->base.base.total_shared,
                                                      task_dispatch.group_size,
                                                      task_dispatch.simd_size);

      /*
       * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address
       * of a buffer with push constants and descriptor set table and
       * InlineData[2:7] will be used for first few push constants.
       */
      task.EmitInlineParameter = true;

      task.XP0Required = task_prog_data->uses_drawid;

#if GFX_VER >= 30
      task.RegistersPerThread = ptl_register_blocks(task_prog_data->base.base.grf_used);
#endif
   }

   /* Recommended values from "Task and Mesh Distribution Programming". */
   anv_pipeline_emit(pipeline, final.task_redistrib,
                     GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
      redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
      redistrib.SmallTaskThreshold = 1; /* 2^N */
      redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
      redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM;
      redistrib.TaskRedistributionMode = TASKREDISTRIB_RR_STRICT;
   }
}

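/* Note on the SIMDSize fields above and below: intel_cs_dispatch_info
 * reports simd_size as 8, 16, or 32, and the integer division by 16 maps
 * those onto the packet's 0/1/2 encoding; task and mesh kernels are SIMD16
 * or SIMD32 in practice, so the programmed value is 1 or 2.
 */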
static void
emit_mesh_state(struct anv_graphics_pipeline *pipeline)
{
   assert(anv_pipeline_is_mesh(pipeline));

   const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
   const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);

   uint32_t mesh_control_dwords[GENX(3DSTATE_MESH_CONTROL_length)];
   anv_pipeline_emit_tmp(pipeline, mesh_control_dwords, GENX(3DSTATE_MESH_CONTROL), mc) {
      mc.MeshShaderEnable = true;
      mc.StatisticsEnable = true;
      mc.MaximumNumberofThreadGroups = 511;
#if GFX_VER >= 20
      mc.VPandRTAIndexAutostripEnable = mesh_prog_data->autostrip_enable;
#endif
   }

   anv_pipeline_emit_merge(pipeline, final.mesh_control,
                           mesh_control_dwords, GENX(3DSTATE_MESH_CONTROL), mc) {
      mc.ScratchSpaceBuffer =
         get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin, false);
   }
   if (pipeline_needs_protected(&pipeline->base.base)) {
      anv_pipeline_emit_merge(pipeline, final.mesh_control_protected,
                              mesh_control_dwords, GENX(3DSTATE_MESH_CONTROL), mc) {
         mc.ScratchSpaceBuffer =
            get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin, true);
      }
   }

   const struct intel_device_info *devinfo = pipeline->base.base.device->info;
   const struct intel_cs_dispatch_info mesh_dispatch =
      brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL);

   const unsigned output_topology =
      mesh_prog_data->primitive_type == MESA_PRIM_POINTS ? OUTPUT_POINT :
      mesh_prog_data->primitive_type == MESA_PRIM_LINES  ? OUTPUT_LINE :
                                                           OUTPUT_TRI;

   uint32_t index_format;
   switch (mesh_prog_data->index_format) {
   case BRW_INDEX_FORMAT_U32:
      index_format = INDEX_U32;
      break;
   case BRW_INDEX_FORMAT_U888X:
      index_format = INDEX_U888X;
      break;
   default:
      unreachable("invalid index format");
   }

   anv_pipeline_emit(pipeline, final.mesh_shader,
                     GENX(3DSTATE_MESH_SHADER), mesh) {
      mesh.KernelStartPointer                = mesh_bin->kernel.offset;
      mesh.SIMDSize                          = mesh_dispatch.simd_size / 16;
      mesh.MessageSIMD                       = mesh.SIMDSize;
      mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads;
      mesh.ExecutionMask                     = mesh_dispatch.right_mask;
      mesh.LocalXMaximum                     = mesh_dispatch.group_size - 1;
      mesh.EmitLocalIDX                      = true;

      mesh.MaximumPrimitiveCount             = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
      mesh.OutputTopology                    = output_topology;
      mesh.PerVertexDataPitch                = mesh_prog_data->map.per_vertex_pitch_dw / 8;
      mesh.PerPrimitiveDataPresent           = mesh_prog_data->map.per_primitive_pitch_dw > 0;
      mesh.PerPrimitiveDataPitch             = mesh_prog_data->map.per_primitive_pitch_dw / 8;
      mesh.IndexFormat                       = index_format;

      mesh.NumberofBarriers                  = mesh_prog_data->base.uses_barrier;
      mesh.SharedLocalMemorySize             =
         intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared);
      mesh.PreferredSLMAllocationSize        =
         intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                      mesh_prog_data->base.base.total_shared,
                                                      mesh_dispatch.group_size,
                                                      mesh_dispatch.simd_size);

      /*
       * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address
       * of a buffer with push constants and descriptor set table and
       * InlineData[2:7] will be used for first few push constants.
       */
      mesh.EmitInlineParameter = true;

      mesh.XP0Required = mesh_prog_data->uses_drawid;

#if GFX_VER >= 30
      mesh.RegistersPerThread = ptl_register_blocks(mesh_prog_data->base.base.grf_used);
#endif
   }

   /* Recommended values from "Task and Mesh Distribution Programming". */
   anv_pipeline_emit(pipeline, final.mesh_distrib,
                     GENX(3DSTATE_MESH_DISTRIB), distrib) {
      distrib.DistributionMode = MESH_RR_FREE;
      distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
      distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
   }
}
#endif

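/* Top-level emission entry point.  Common state (URB, raster, clip, VFG) is
 * emitted first, then either the legacy geometry path (vertex input, VS, HS,
 * DS, TE, GS, streamout) or the mesh path; whichever path is unused gets its
 * packets emitted zeroed, since e.g. BSpec 46303 (cited below) forbids
 * enabling mesh and streamout at the same time.
 */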
void
genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
                             const struct vk_graphics_pipeline_state *state)
{
   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_setup(pipeline, &urb_deref_block_size);

   emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp,
                 urb_deref_block_size);
   compute_kill_pixel(pipeline, state->ms, state);

   emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);

#if GFX_VER >= 12
   emit_3dstate_primitive_replication(pipeline, state->rp);
#endif

#if GFX_VERx10 >= 125
   bool needs_instance_granularity =
      intel_needs_workaround(pipeline->base.base.device->info, 14019166699) &&
      (sbe_primitive_id_override(pipeline) ||
       geom_or_tess_prim_id_used(pipeline));

   anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
      /* Gfx12.5: If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
      vfg.DistributionMode =
#if GFX_VER < 20
         !anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_FREE :
#endif
         RR_STRICT;
      vfg.DistributionGranularity = needs_instance_granularity ?
         InstanceLevelGranularity : BatchLevelGranularity;
#if INTEL_WA_14014851047_GFX_VER
      vfg.GranularityThresholdDisable =
         intel_needs_workaround(pipeline->base.base.device->info, 14014851047);
#endif
      /* 192 vertices for TRILIST_ADJ */
      vfg.ListNBatchSizeScale = 0;
      /* Batch size of 384 vertices */
      vfg.List3BatchSizeScale = 2;
      /* Batch size of 128 vertices */
      vfg.List2BatchSizeScale = 1;
      /* Batch size of 128 vertices */
      vfg.List1BatchSizeScale = 2;
      /* Batch size of 256 vertices for STRIP topologies */
      vfg.StripBatchSizeScale = 3;
      /* 192 control points for PATCHLIST_3 */
      vfg.PatchBatchSizeScale = 1;
      /* 192 control points for PATCHLIST_3 */
      vfg.PatchBatchSizeMultiplier = 31;
   }
#endif

   if (anv_pipeline_is_primitive(pipeline)) {
      emit_vertex_input(pipeline, state, state->vi);

      emit_3dstate_vs(pipeline);
      emit_3dstate_hs_ds(pipeline, state->ts);
      emit_3dstate_te(pipeline);
      emit_3dstate_gs(pipeline);

      emit_3dstate_streamout(pipeline, state->rs);

#if GFX_VERx10 >= 125
      const struct anv_device *device = pipeline->base.base.device;
      /* Disable Mesh. */
      if (device->vk.enabled_extensions.EXT_mesh_shader) {
         anv_pipeline_emit(pipeline, final.mesh_control,
                           GENX(3DSTATE_MESH_CONTROL), zero);
         anv_pipeline_emit(pipeline, final.mesh_control_protected,
                           GENX(3DSTATE_MESH_CONTROL), zero);
         anv_pipeline_emit(pipeline, final.mesh_shader,
                           GENX(3DSTATE_MESH_SHADER), zero);
         anv_pipeline_emit(pipeline, final.mesh_distrib,
                           GENX(3DSTATE_MESH_DISTRIB), zero);
         anv_pipeline_emit(pipeline, final.clip_mesh,
                           GENX(3DSTATE_CLIP_MESH), zero);
         anv_pipeline_emit(pipeline, final.sbe_mesh,
                           GENX(3DSTATE_SBE_MESH), zero);
         anv_pipeline_emit(pipeline, final.task_control,
                           GENX(3DSTATE_TASK_CONTROL), zero);
         anv_pipeline_emit(pipeline, final.task_control_protected,
                           GENX(3DSTATE_TASK_CONTROL), zero);
         anv_pipeline_emit(pipeline, final.task_shader,
                           GENX(3DSTATE_TASK_SHADER), zero);
         anv_pipeline_emit(pipeline, final.task_redistrib,
                           GENX(3DSTATE_TASK_REDISTRIB), zero);
      }
#endif
   } else {
      assert(anv_pipeline_is_mesh(pipeline));

      anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs);
#if GFX_VER >= 11
      anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif
      anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs);
      anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
      anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
      anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te);
      anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);

      anv_pipeline_emit(pipeline, final.vs_protected, GENX(3DSTATE_VS), vs);
      anv_pipeline_emit(pipeline, final.hs_protected, GENX(3DSTATE_HS), hs);
      anv_pipeline_emit(pipeline, final.ds_protected, GENX(3DSTATE_DS), ds);
      anv_pipeline_emit(pipeline, partial.gs_protected, GENX(3DSTATE_GS), gs);

      /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
       * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
       */
      anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);

#if GFX_VERx10 >= 125
      emit_task_state(pipeline);
      emit_mesh_state(pipeline);
#endif
   }

   emit_3dstate_sbe(pipeline);
   emit_3dstate_wm(pipeline, state->ia, state->rs,
                   state->ms, state->cb, state->rp);
   emit_3dstate_ps(pipeline, state->ms, state->cb);
   emit_3dstate_ps_extra(pipeline, state->rs, state);
}

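/* On Gfx12.5+ only the L3 configuration is baked at pipeline-build time;
 * the remaining compute state presumably travels with the walker command
 * emitted at dispatch time.  Older generations instead bake MEDIA_VFE_STATE
 * and the INTERFACE_DESCRIPTOR_DATA below.
 */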
#if GFX_VERx10 >= 125

void
genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
{
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
}

#else /* #if GFX_VERx10 >= 125 */

void
genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
{
   struct anv_device *device = pipeline->base.device;
   const struct intel_device_info *devinfo = device->info;
   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);

   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);

   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
            cs_prog_data->push.cross_thread.regs, 2);

   const struct anv_shader_bin *cs_bin = pipeline->cs;

   anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
      vfe.StackSize              = 0;
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries     = 2;
#if GFX_VER < 11
      vfe.ResetGatewayTimer      = true;
#endif
      vfe.URBEntryAllocationSize = 2;
      vfe.CURBEAllocationSize    = vfe_curbe_allocation;

      if (cs_prog_data->base.total_scratch) {
         /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
          * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
          */
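         /* Assuming total_scratch has been rounded up to a power of two of
          * at least 1k (the compiler does this), ffs() yields log2 + 1, so
          * e.g. 1024 -> 11 - 11 = 0 ("1k") and 4096 -> 13 - 11 = 2 ("4k").
          */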
         vfe.PerThreadScratchSpace = ffs(cs_prog_data->base.total_scratch) - 11;
         vfe.ScratchSpaceBasePointer =
            get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
      }
   }

   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
      .KernelStartPointer     =
         cs_bin->kernel.offset +
         brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),

      /* Wa_1606682166 */
      .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
      /* We add 1 because the CS indirect parameters buffer isn't accounted
       * for in bind_map.surface_count.
       *
       * Typically set to 0 to avoid prefetching on every thread dispatch.
       */
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : MIN2(pipeline->cs->bind_map.surface_count, 30),
      .BarrierEnable          = cs_prog_data->uses_barrier,
      .SharedLocalMemorySize  =
         intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),

      .ConstantURBEntryReadOffset = 0,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#if GFX_VER >= 12
      /* TODO: Check if we are missing workarounds and enable mid-thread
       * preemption.
       *
       * We still have issues with mid-thread preemption (it was already
       * disabled by the kernel on gfx11, due to missing workarounds). It's
       * possible that we are just missing some workarounds, and could enable
       * it later, but for now let's disable it to fix a GPU hang in compute
       * in Car Chase (and possibly more).
       */
      .ThreadPreemptionDisable = true,
#endif

      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
   };
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
                                        pipeline->interface_descriptor_data,
                                        &desc);
}

#endif /* #if GFX_VERx10 >= 125 */

#if GFX_VERx10 >= 125

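/* Shader binding table handles: each ray-tracing shader group's handle is
 * packed according to its type -- general groups carry a single bindless
 * shader record, triangle hit groups an optional closest-hit plus any-hit
 * pair, and procedural groups an optional closest-hit plus a mandatory
 * intersection record.
 */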
void
genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
{
   for (uint32_t i = 0; i < pipeline->group_count; i++) {
      struct anv_rt_shader_group *group = &pipeline->groups[i];

      switch (group->type) {
      case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
         struct GENX(RT_GENERAL_SBT_HANDLE) sh = {};
         sh.General = anv_shader_bin_get_bsr(group->general, 32);
         GENX(RT_GENERAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
         break;
      }

      case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
         struct GENX(RT_TRIANGLES_SBT_HANDLE) sh = {};
         if (group->closest_hit)
            sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
         if (group->any_hit)
            sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
         GENX(RT_TRIANGLES_SBT_HANDLE_pack)(NULL, group->handle, &sh);
         break;
      }

      case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
         struct GENX(RT_PROCEDURAL_SBT_HANDLE) sh = {};
         if (group->closest_hit)
            sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
         sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
         GENX(RT_PROCEDURAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
         break;
      }

      default:
         unreachable("Invalid shader group type");
      }
   }
}

#else

void
genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
{
   unreachable("Ray tracing not supported");
}

#endif /* GFX_VERx10 >= 125 */