/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"

#include "common/intel_genX_state_brw.h"
#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"
#include "vk_log.h"
#include "vk_render_pass.h"

static inline struct anv_batch *
anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
                     struct anv_gfx_state_ptr *ptr,
                     uint32_t n_dwords)
{
   struct anv_batch *batch = &pipeline->base.base.batch;

   assert(ptr->len == 0 ||
          (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
   if (ptr->len == 0)
      ptr->offset = (batch->next - batch->start) / 4;
   ptr->len += n_dwords;

   return batch;
}

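/* Emit a fixed-length command into the pipeline batch, recording its dword
 * offset and length in the given anv_gfx_state_ptr so the packed dwords can
 * later be located by offset.  Usage follows the same statement-like pattern
 * as anv_batch_emit(), e.g.:
 *
 *    anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
 *       vs.Enable = true;
 *    }
 */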
#define anv_pipeline_emit(pipeline, state, cmd, name)                   \
   for (struct cmd name = { __anv_cmd_header(cmd) },                    \
           *_dst = anv_batch_emit_dwords(                               \
              anv_gfx_pipeline_add(pipeline,                            \
                                   &(pipeline)->state,                  \
                                   __anv_cmd_length(cmd)),              \
              __anv_cmd_length(cmd));                                   \
        __builtin_expect(_dst != NULL, 1);                              \
        ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch,            \
                               _dst, &name);                            \
           VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
           _dst = NULL;                                                 \
        }))

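/* Variable-length counterpart of anv_pipeline_emit(): reserves n dwords,
 * packs the packet header (including DWordLength) from the template, and
 * returns a pointer to the reserved dwords so the caller can pack trailing
 * data, as done for 3DSTATE_SO_DECL_LIST below.
 */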
#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({             \
   void *__dst = anv_batch_emit_dwords(                                 \
      anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n);        \
   if (__dst) {                                                         \
      struct cmd __template = {                                         \
         __anv_cmd_header(cmd),                                         \
         .DWordLength = n - __anv_cmd_length_bias(cmd),                 \
         __VA_ARGS__                                                    \
      };                                                                \
      __anv_cmd_pack(cmd)(&pipeline->base.base.batch,                   \
                          __dst, &__template);                          \
   }                                                                    \
   __dst;                                                               \
   })


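/* Select the 3D_Vertex_Component_Control value for component `comp` of a
 * vertex element with the given format: store the source when the format
 * defines bits for that channel, otherwise pad missing components with 0
 * (or with 1 for a missing alpha), honoring the 64-bit PASSTHRU rules
 * quoted below.
 */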
static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From the Broadwell spec, command reference structures, page 586:
    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *   64-bit components are stored in the URB without any conversion. In
    *   this case, vertex elements must be written as 128 or 256 bits, with
    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
    *   256-bit vertex element."
    */
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk.
       */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware
       * restrictions (see comment above).
       */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

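/* Pack the VERTEX_ELEMENT_STATE dwords for the given vertex input state and
 * emit one 3DSTATE_VF_INSTANCING per attribute.  When emit_in_pipeline is
 * true, the instancing packets are recorded in the pipeline's batch;
 * otherwise they are emitted into `batch` (the dynamic vertex-input path).
 */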
void
genX(emit_vertex_input)(struct anv_batch *batch,
                        uint32_t *vertex_element_dws,
                        struct anv_graphics_pipeline *pipeline,
                        const struct vk_vertex_input_state *vi,
                        bool emit_in_pipeline)
{
   const struct anv_device *device = pipeline->base.base.device;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read & inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;

   for (uint32_t i = 0; i < pipeline->vs_input_elements; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zeros.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &vertex_element_dws[i * 2],
                                      &element);
   }

   u_foreach_bit(a, vi->attributes_valid) {
      enum isl_format format = anv_get_isl_format(device->info,
                                                  vi->attributes[a].format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);
      assume(format < ISL_NUM_FORMATS);

      uint32_t binding = vi->attributes[a].binding;
      assert(binding < MAX_VBS);

      if ((elements & (1 << a)) == 0)
         continue; /* Binding unused */

      uint32_t slot =
         __builtin_popcount(elements & ((1 << a) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                        ((1 << a) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = vi->attributes[a].binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = vi->attributes[a].offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &vertex_element_dws[slot * 2],
                                      &element);

      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      if (emit_in_pipeline) {
         anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
            bool per_instance = vi->bindings[binding].input_rate ==
               VK_VERTEX_INPUT_RATE_INSTANCE;
            uint32_t divisor = vi->bindings[binding].divisor *
               pipeline->instance_multiplier;

            vfi.InstancingEnable = per_instance;
            vfi.VertexElementIndex = slot;
            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
         }
      } else {
         anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
            bool per_instance = vi->bindings[binding].input_rate ==
               VK_VERTEX_INPUT_RATE_INSTANCE;
            uint32_t divisor = vi->bindings[binding].divisor *
               pipeline->instance_multiplier;

            vfi.InstancingEnable = per_instance;
            vfi.VertexElementIndex = slot;
            vfi.InstanceDataStepRate = per_instance ? divisor : 1;
         }
      }
   }
}

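/* Pipeline-time vertex input setup: packs the static VERTEX_ELEMENT_STATE
 * (when not dynamic) and appends the system-generated value elements for
 * BaseVertex/BaseInstance and DrawID, plus 3DSTATE_VF_SGVS (and
 * 3DSTATE_VF_SGVS_2 on Gfx11+) to point the shader at those slots.
 */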
static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const struct vk_graphics_pipeline_state *state,
                  const struct vk_vertex_input_state *vi)
{
   /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
    * everything in gfx8_cmd_buffer.c
    */
   if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
      genX(emit_vertex_input)(NULL,
                              pipeline->vertex_input_data,
                              pipeline, vi, true /* emit_in_pipeline */);
   }

   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const bool needs_svgs_elem = pipeline->svgs_count > 1 ||
                                !vs_prog_data->uses_drawid;
   const uint32_t id_slot = pipeline->vs_input_elements;
   const uint32_t drawid_slot = id_slot + needs_svgs_elem;
   if (pipeline->svgs_count > 0) {
      assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
      uint32_t slot_offset =
         pipeline->vertex_input_elems - pipeline->svgs_count;

      if (needs_svgs_elem) {
#if GFX_VER < 11
         /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
          *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
          *    Control field is set to something other than VFCOMP_STORE_SRC,
          *    no higher-numbered Component Control fields may be set to
          *    VFCOMP_STORE_SRC"
          *
          * This means that if we have BaseInstance, we need BaseVertex as
          * well.  Just do all or nothing.
          */
         uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                               vs_prog_data->uses_baseinstance) ?
                              VFCOMP_STORE_SRC : VFCOMP_STORE_0;
#endif

         struct GENX(VERTEX_ELEMENT_STATE) element = {
            .VertexBufferIndex = ANV_SVGS_VB_INDEX,
            .Valid = true,
            .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
#if GFX_VER >= 11
            /* On gen11, these are taken care of by extra parameter slots */
            .Component0Control = VFCOMP_STORE_0,
            .Component1Control = VFCOMP_STORE_0,
#else
            .Component0Control = base_ctrl,
            .Component1Control = base_ctrl,
#endif
            .Component2Control = VFCOMP_STORE_0,
            .Component3Control = VFCOMP_STORE_0,
         };
         GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                         &pipeline->vertex_input_data[slot_offset * 2],
                                         &element);
         slot_offset++;

         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
                           GENX(3DSTATE_VF_INSTANCING), vfi) {
            vfi.VertexElementIndex = id_slot;
         }
      }

      if (vs_prog_data->uses_drawid) {
         struct GENX(VERTEX_ELEMENT_STATE) element = {
            .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
            .Valid = true,
            .SourceElementFormat = ISL_FORMAT_R32_UINT,
#if GFX_VER >= 11
            /* On gen11, this is taken care of by extra parameter slots */
            .Component0Control = VFCOMP_STORE_0,
#else
            .Component0Control = VFCOMP_STORE_SRC,
#endif
            .Component1Control = VFCOMP_STORE_0,
            .Component2Control = VFCOMP_STORE_0,
            .Component3Control = VFCOMP_STORE_0,
         };
         GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                         &pipeline->vertex_input_data[slot_offset * 2],
                                         &element);
         slot_offset++;

         anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
                           GENX(3DSTATE_VF_INSTANCING), vfi) {
            vfi.VertexElementIndex = drawid_slot;
         }
      }
   }

   anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber     = 2;
      sgvs.VertexIDElementOffset       = id_slot;
      sgvs.InstanceIDEnable            = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber   = 3;
      sgvs.InstanceIDElementOffset     = id_slot;
   }

#if GFX_VER >= 11
   anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
      /* gl_BaseVertex */
      sgvs.XP0Enable                   = vs_prog_data->uses_firstvertex;
      sgvs.XP0SourceSelect             = XP0_PARAMETER;
      sgvs.XP0ComponentNumber          = 0;
      sgvs.XP0ElementOffset            = id_slot;

      /* gl_BaseInstance */
      sgvs.XP1Enable                   = vs_prog_data->uses_baseinstance;
      sgvs.XP1SourceSelect             = StartingInstanceLocation;
      sgvs.XP1ComponentNumber          = 1;
      sgvs.XP1ElementOffset            = id_slot;

      /* gl_DrawID */
      sgvs.XP2Enable                   = vs_prog_data->uses_drawid;
      sgvs.XP2ComponentNumber          = 0;
      sgvs.XP2ElementOffset            = drawid_slot;
   }
#endif
}

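/* Program the URB partitioning (3DSTATE_URB_{VS,HS,DS,GS}) for the given L3
 * configuration and active stages.  The sub-opcodes of the four packets are
 * consecutive, which is why a single 3DSTATE_URB_VS template is reused with
 * an adjusted _3DCommandSubOpcode.
 */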
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct intel_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const struct intel_urb_config *urb_cfg_in,
                     struct intel_urb_config *urb_cfg_out,
                     enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = device->info;

   bool constrained;
   intel_get_urb_config(devinfo, l3_config,
                        active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                        urb_cfg_out, deref_block_size,
                        &constrained);

#if INTEL_NEEDS_WA_16014912113
   if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out,
       MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) {
      for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
         anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
            urb._3DCommandSubOpcode      += i;
            urb.VSURBStartingAddress      = urb_cfg_in->start[i];
            urb.VSURBEntryAllocationSize  = urb_cfg_in->size[i] - 1;
            urb.VSNumberofURBEntries      = i == 0 ? 256 : 0;
         }
      }
      genx_batch_emit_pipe_control(batch, device->info, _3D,
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = urb_cfg_out->start[i];
         urb.VSURBEntryAllocationSize  = urb_cfg_out->size[i] - 1;
         urb.VSNumberofURBEntries      = urb_cfg_out->entries[i];
      }
   }
#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
   }
#endif
}

#if GFX_VERx10 >= 125
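/* Mesh pipelines only use the TASK/MESH URB allocations, so the legacy
 * per-stage URB packets are emitted zeroed before programming
 * 3DSTATE_URB_ALLOC_TASK/MESH from intel_get_mesh_urb_config().
 */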
static void
emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
                    enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = pipeline->base.base.device->info;

   const struct brw_task_prog_data *task_prog_data =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
      get_task_prog_data(pipeline) : NULL;
   const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);

   const struct intel_mesh_urb_allocation alloc =
      intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
                                task_prog_data ? task_prog_data->map.size_dw : 0,
                                mesh_prog_data->map.size_dw);

   /* Zero out the primitive pipeline URB allocations. */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
   }

   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
      if (task_prog_data) {
         urb.TASKURBEntryAllocationSize   = alloc.task_entry_size_64b - 1;
         urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
         urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
         urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
         urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
      }
   }

   anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
      urb.MESHURBEntryAllocationSize   = alloc.mesh_entry_size_64b - 1;
      urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
      urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
      urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
      urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
   }

   *deref_block_size = alloc.deref_block_size;
}
#endif

static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum intel_urb_deref_block_size *deref_block_size)
{
#if GFX_VERx10 >= 125
   if (anv_pipeline_is_mesh(pipeline)) {
      emit_urb_setup_mesh(pipeline, deref_block_size);
      return;
   }
#endif
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data;

      pipeline->urb_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }

   struct anv_device *device = pipeline->base.base.device;
   const struct intel_device_info *devinfo = device->info;

   bool constrained;
   intel_get_urb_config(devinfo,
                        pipeline->base.base.l3_config,
                        pipeline->base.base.active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        pipeline->base.base.active_stages &
                           VK_SHADER_STAGE_GEOMETRY_BIT,
                        &pipeline->urb_cfg, deref_block_size,
                        &constrained);

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = pipeline->urb_cfg.start[i];
         urb.VSURBEntryAllocationSize  = pipeline->urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries      = pipeline->urb_cfg.entries[i];
      }
   }

#if GFX_VERx10 >= 125
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
      anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
   }
#endif
}

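/* Returns true when the fragment shader reads gl_PrimitiveID but no earlier
 * stage wrote it to the VUE, in which case the SBE unit has to synthesize
 * the value (see the PrimitiveIDOverride* fields below).
 */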
static bool
sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   if (!wm_prog_data)
      return false;

   const struct intel_vue_map *fs_input_map =
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
          fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
}

static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
      anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
#if GFX_VERx10 >= 125
      if (anv_pipeline_is_mesh(pipeline))
         anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
#endif
      return;
   }

   anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
   anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {

      /* TODO(mesh): Figure out cases where we need attribute swizzling.  See also
       * calculate_urb_setup() and related functions.
       */
      sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
      sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;

      if (anv_pipeline_is_primitive(pipeline)) {
         const struct intel_vue_map *fs_input_map =
            &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

         int first_slot =
            brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                fs_input_map);
         assert(first_slot % 2 == 0);
         unsigned urb_entry_read_offset = first_slot / 2;
         int max_source_attr = 0;
         for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
            uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
            int input_index = wm_prog_data->urb_setup[attr];

            assert(0 <= input_index);

            /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in
             * the VUE header
             */
            if (attr == VARYING_SLOT_VIEWPORT ||
                attr == VARYING_SLOT_LAYER ||
                attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
               continue;
            }

            if (attr == VARYING_SLOT_PNTC) {
               sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
               continue;
            }

            const int slot = fs_input_map->varying_to_slot[attr];

            if (slot == -1) {
               /* This attribute does not exist in the VUE--that means that
                * the vertex shader did not write to it. It could be that it's
                * a regular varying read by the fragment shader but not
                * written by the vertex shader, or it's gl_PrimitiveID. In the
                * first case the value is undefined, in the second it needs to
                * be gl_PrimitiveID.
                */
               swiz.Attribute[input_index].ConstantSource = PRIM_ID;
               swiz.Attribute[input_index].ComponentOverrideX = true;
               swiz.Attribute[input_index].ComponentOverrideY = true;
               swiz.Attribute[input_index].ComponentOverrideZ = true;
               swiz.Attribute[input_index].ComponentOverrideW = true;
               continue;
            }

            /* We have to subtract two slots to account for the URB entry
             * output read offset in the VS and GS stages.
             */
            const int source_attr = slot - 2 * urb_entry_read_offset;
            assert(source_attr >= 0 && source_attr < 32);
            max_source_attr = MAX2(max_source_attr, source_attr);
            /* The hardware can only perform overrides on the first 16
             * attributes; the remaining (up to 16) attributes have to be
             * lined up so that the input index equals the output index.
             * We'll need to do some tweaking to make sure that's the case.
             */
            if (input_index < 16)
               swiz.Attribute[input_index].SourceAttribute = source_attr;
            else
               assert(source_attr == input_index);
         }

         sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
         sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
         sbe.ForceVertexURBEntryReadOffset = true;
         sbe.ForceVertexURBEntryReadLength = true;

         /* Ask the hardware to supply PrimitiveID if the fragment shader
          * reads it but a previous stage didn't write one.
          */
         if (sbe_primitive_id_override(pipeline)) {
            sbe.PrimitiveIDOverrideAttributeSelect =
               wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
            sbe.PrimitiveIDOverrideComponentX = true;
            sbe.PrimitiveIDOverrideComponentY = true;
            sbe.PrimitiveIDOverrideComponentZ = true;
            sbe.PrimitiveIDOverrideComponentW = true;
         }
      } else {
         assert(anv_pipeline_is_mesh(pipeline));
#if GFX_VERx10 >= 125
         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         anv_pipeline_emit(pipeline, final.sbe_mesh,
                           GENX(3DSTATE_SBE_MESH), sbe_mesh) {
            const struct brw_mue_map *mue = &mesh_prog_data->map;

            assert(mue->per_vertex_header_size_dw % 8 == 0);
            sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
            sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);

            /* Clip distance array is passed in the per-vertex header so that
             * it can be consumed by the HW. If user wants to read it in the
             * FS, adjust the offset and length to cover it. Conveniently it
             * is at the end of the per-vertex header, right before per-vertex
             * attributes.
             *
             * Note that FS attribute reading must be aware that the clip
             * distances have fixed position.
             */
            if (mue->per_vertex_header_size_dw > 8 &&
                (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
                 wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
            }

            if (mue->user_data_in_vertex_header) {
               sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
               sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
            }

            assert(mue->per_primitive_header_size_dw % 8 == 0);
            sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
               mue->per_primitive_header_size_dw / 8;
            sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
               DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);

            /* Just like with clip distances, if Primitive Shading Rate,
             * Viewport Index or Layer is read back in the FS, adjust the
             * offset and length to cover the Primitive Header, where PSR,
             * Viewport Index & Layer are stored.
             */
            if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
                wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
                wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
                mue->user_data_in_primitive_header) {
               assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
               sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
               sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
            }
         }
#endif
      }
   }
   }
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
VkPolygonMode
genX(raster_polygon_mode)(const struct anv_graphics_pipeline *pipeline,
                          VkPolygonMode polygon_mode,
                          VkPrimitiveTopology primitive_topology)
{
   if (anv_pipeline_is_mesh(pipeline)) {
      switch (get_mesh_prog_data(pipeline)->primitive_type) {
      case MESA_PRIM_POINTS:
         return VK_POLYGON_MODE_POINT;
      case MESA_PRIM_LINES:
         return VK_POLYGON_MODE_LINE;
      case MESA_PRIM_TRIANGLES:
         return polygon_mode;
      default:
         unreachable("invalid primitive type for mesh");
      }
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return polygon_mode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return polygon_mode;
      }
      unreachable("Unsupported TES output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return polygon_mode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}

const uint32_t genX(vk_to_intel_cullmode)[] = {
   [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
};

const uint32_t genX(vk_to_intel_fillmode)[] = {
   [VK_POLYGON_MODE_FILL]                    = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE]                    = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT]                   = FILL_MODE_POINT,
};

const uint32_t genX(vk_to_intel_front_face)[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
   [VK_FRONT_FACE_CLOCKWISE]                 = 0
};

static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const struct vk_input_assembly_state *ia,
              const struct vk_rasterization_state *rs,
              const struct vk_multisample_state *ms,
              const struct vk_render_pass_state *rp,
              enum intel_urb_deref_block_size urb_deref_block_size)
{
   anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
      sf.ViewportTransformEnable = true;
      sf.StatisticsEnable = true;
      sf.VertexSubPixelPrecisionSelect = _8Bit;
      sf.AALineDistanceMode = true;

#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif

      bool point_from_shader;
      if (anv_pipeline_is_primitive(pipeline)) {
         const struct brw_vue_prog_data *last_vue_prog_data =
            anv_pipeline_get_last_vue_prog_data(pipeline);
         point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
      } else {
         assert(anv_pipeline_is_mesh(pipeline));
         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
      }

      if (point_from_shader) {
         sf.PointWidthSource = Vertex;
      } else {
         sf.PointWidthSource = State;
         sf.PointWidth = 1.0;
      }
   }

   anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
      /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
       * "Multisample Modes State".
       */
      /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
       * computations.  If we ever set this bit to a different value, they will
       * need to be updated accordingly.
       */
      raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
      raster.ForceMultisampling = false;

      raster.ScissorRectangleEnable = true;
   }
}

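/* 3DSTATE_MULTISAMPLE's sample-count field is log2 encoded, hence the
 * __builtin_ffs(rasterization_samples) - 1 below (1x -> 0, 2x -> 1,
 * 4x -> 2, and so on).
 */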
static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
              const struct vk_multisample_state *ms)
{
   anv_pipeline_emit(pipeline, final.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
      ms.NumberofMultisamples       = __builtin_ffs(pipeline->rasterization_samples) - 1;

      ms.PixelLocation              = CENTER;

      /* The PRM says that this bit is valid only for DX9:
       *
       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
       *    should not have any effect by setting or not setting this bit.
       */
      ms.PixelPositionOffsetEnable  = false;
   }
}

const uint32_t genX(vk_to_intel_logic_op)[] = {
   [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
   [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
   [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
   [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
};

const uint32_t genX(vk_to_intel_compare_op)[] = {
   [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_ALWAYS,
};

const uint32_t genX(vk_to_intel_stencil_op)[] = {
   [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
};

const uint32_t genX(vk_to_intel_primitive_type)[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

static void
emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
                  const struct vk_input_assembly_state *ia,
                  const struct vk_viewport_state *vp,
                  const struct vk_rasterization_state *rs)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   (void) wm_prog_data;

   anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
      clip.ClipEnable               = true;
      clip.StatisticsEnable         = true;
      clip.EarlyCullEnable          = true;
      clip.GuardbandClipTestEnable  = true;

      clip.VertexSubPixelPrecisionSelect = _8Bit;
      clip.ClipMode = CLIPMODE_NORMAL;

      clip.MinimumPointWidth = 0.125;
      clip.MaximumPointWidth = 255.875;

      /* TODO(mesh): Multiview. */
      if (anv_pipeline_is_primitive(pipeline)) {
         const struct brw_vue_prog_data *last =
            anv_pipeline_get_last_vue_prog_data(pipeline);

         /* From the Vulkan 1.0.45 spec:
          *
          *    "If the last active vertex processing stage shader entry
          *    point's interface does not include a variable decorated with
          *    ViewportIndex, then the first viewport is used."
          */
         if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
            clip.MaximumVPIndex = vp->viewport_count > 0 ?
               vp->viewport_count - 1 : 0;
         } else {
            clip.MaximumVPIndex = 0;
         }

         /* From the Vulkan 1.0.45 spec:
          *
          *    "If the last active vertex processing stage shader entry point's
          *    interface does not include a variable decorated with Layer, then
          *    the first layer is used."
          */
         clip.ForceZeroRTAIndexEnable =
            !(last->vue_map.slots_valid & VARYING_BIT_LAYER);

      } else if (anv_pipeline_is_mesh(pipeline)) {
         const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
         if (vp && vp->viewport_count > 0 &&
             mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
            clip.MaximumVPIndex = vp->viewport_count - 1;
         } else {
            clip.MaximumVPIndex = 0;
         }

         clip.ForceZeroRTAIndexEnable =
            mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
      }

      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
         wm_prog_data->uses_nonperspective_interp_modes : 0;
   }

#if GFX_VERx10 >= 125
   if (anv_pipeline_is_mesh(pipeline)) {
      const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
      anv_pipeline_emit(pipeline, final.clip_mesh,
                        GENX(3DSTATE_CLIP_MESH), clip_mesh) {
         clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
         clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
         clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
      }
   }
#endif
}

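/* Build the 3DSTATE_SO_DECL_LIST from the NIR XFB info of the last
 * pre-rasterization stage (GS, then TES, then VS) and program
 * 3DSTATE_STREAMOUT accordingly.
 */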
static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                       const struct vk_rasterization_state *rs)
{
   const struct brw_vue_prog_data *prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
   const struct intel_vue_map *vue_map = &prog_data->vue_map;

   nir_xfb_info *xfb_info;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
      xfb_info = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->xfb_info;
   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      xfb_info = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
   else
      xfb_info = pipeline->base.shaders[MESA_SHADER_VERTEX]->xfb_info;

   if (xfb_info) {
      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};

      memset(so_decl, 0, sizeof(so_decl));

      for (unsigned i = 0; i < xfb_info->output_count; i++) {
         const nir_xfb_output_info *output = &xfb_info->outputs[i];
         unsigned buffer = output->buffer;
         unsigned stream = xfb_info->buffer_to_stream[buffer];

         /* Our hardware is unusual in that it requires us to program SO_DECLs
          * for fake "hole" components, rather than simply taking the offset
          * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
          * program as many size = 4 holes as we can, then a final hole to
          * accommodate the final 1, 2, or 3 remaining.
          */
         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
         while (hole_dwords > 0) {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = 1,
               .OutputBufferSlot = buffer,
               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
            };
            hole_dwords -= 4;
         }

         int varying = output->location;
         uint8_t component_mask = output->component_mask;
         /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
          * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
          * - VARYING_SLOT_LAYER                  in VARYING_SLOT_PSIZ.y
          * - VARYING_SLOT_VIEWPORT               in VARYING_SLOT_PSIZ.z
          * - VARYING_SLOT_PSIZ                   in VARYING_SLOT_PSIZ.w
          */
         if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 0; // SO_DECL_COMPMASK_X
         } else if (varying == VARYING_SLOT_LAYER) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
         } else if (varying == VARYING_SLOT_VIEWPORT) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
         } else if (varying == VARYING_SLOT_PSIZ) {
            component_mask = 1 << 3; // SO_DECL_COMPMASK_W
         }

         next_offset[buffer] = output->offset +
                               __builtin_popcount(component_mask) * 4;

         const int slot = vue_map->varying_to_slot[varying];
         if (slot < 0) {
            /* This can happen if the shader never writes to the varying.
             * Insert a hole instead of actual varying data.
             */
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = true,
               .OutputBufferSlot = buffer,
               .ComponentMask = component_mask,
            };
         } else {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .OutputBufferSlot = buffer,
               .RegisterIndex = slot,
               .ComponentMask = component_mask,
            };
         }
      }

      int max_decls = 0;
      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
         max_decls = MAX2(max_decls, decls[s]);

      uint8_t sbs[MAX_XFB_STREAMS] = { };
      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
         if (xfb_info->buffers_written & (1 << b))
            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
      }

      uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
                                        3 + 2 * max_decls,
                                        GENX(3DSTATE_SO_DECL_LIST),
                                        .StreamtoBufferSelects0 = sbs[0],
                                        .StreamtoBufferSelects1 = sbs[1],
                                        .StreamtoBufferSelects2 = sbs[2],
                                        .StreamtoBufferSelects3 = sbs[3],
                                        .NumEntries0 = decls[0],
                                        .NumEntries1 = decls[1],
                                        .NumEntries2 = decls[2],
                                        .NumEntries3 = decls[3]);

      for (int i = 0; i < max_decls; i++) {
         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
            &(struct GENX(SO_DECL_ENTRY)) {
               .Stream0Decl = so_decl[0][i],
               .Stream1Decl = so_decl[1][i],
               .Stream2Decl = so_decl[2][i],
               .Stream3Decl = so_decl[3][i],
            });
      }
   }

   anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
      if (xfb_info) {
         pipeline->uses_xfb = true;

         so.SOFunctionEnable = true;
         so.SOStatisticsEnable = true;

         so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
         so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
         so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
         so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;

         int urb_entry_read_offset = 0;
         int urb_entry_read_length =
            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;

         /* We always read the whole vertex. This could be reduced at some
          * point by reading less and offsetting the register index in the
          * SO_DECLs.
          */
         so.Stream0VertexReadOffset = urb_entry_read_offset;
         so.Stream0VertexReadLength = urb_entry_read_length - 1;
         so.Stream1VertexReadOffset = urb_entry_read_offset;
         so.Stream1VertexReadLength = urb_entry_read_length - 1;
         so.Stream2VertexReadOffset = urb_entry_read_offset;
         so.Stream2VertexReadLength = urb_entry_read_length - 1;
         so.Stream3VertexReadOffset = urb_entry_read_offset;
         so.Stream3VertexReadLength = urb_entry_read_length - 1;
      }
   }
}

static uint32_t
get_sampler_count(const struct anv_shader_bin *bin)
{
   uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);

   /* We can potentially have way more than 32 samplers and that's ok.
    * However, the 3DSTATE_XS packets only have 3 bits to specify how
    * many to pre-fetch and all values above 4 are marked reserved.
    */
   return MIN2(count_by_4, 4);
}

static UNUSED struct anv_address
get_scratch_address(struct anv_pipeline *pipeline,
                    gl_shader_stage stage,
                    const struct anv_shader_bin *bin)
{
   return (struct anv_address) {
      .bo = anv_scratch_pool_alloc(pipeline->device,
                                   &pipeline->device->scratch_pool,
                                   stage, bin->prog_data->total_scratch),
      .offset = 0,
   };
}

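/* Convert the total scratch size into the log2-style per-thread scratch
 * space encoding used by the 3DSTATE_XS packets (0 -> 1KB, 1 -> 2KB,
 * 2 -> 4KB, ...).
 */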
static UNUSED uint32_t
get_scratch_space(const struct anv_shader_bin *bin)
{
   return ffs(bin->prog_data->total_scratch / 2048);
}

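/* For platforms that reach scratch through a surface state: adds the scratch
 * BO to the pipeline's relocation list and returns the scratch surface
 * offset, shifted to match the alignment the ScratchSpaceBuffer packet field
 * appears to expect.
 */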
static UNUSED uint32_t
get_scratch_surf(struct anv_pipeline *pipeline,
                 gl_shader_stage stage,
                 const struct anv_shader_bin *bin)
{
   if (bin->prog_data->total_scratch == 0)
      return 0;

   struct anv_bo *bo =
      anv_scratch_pool_alloc(pipeline->device,
                             &pipeline->device->scratch_pool,
                             stage, bin->prog_data->total_scratch);
   anv_reloc_list_add_bo(pipeline->batch.relocs, bo);
   return anv_scratch_pool_get_surf(pipeline->device,
                                    &pipeline->device->scratch_pool,
                                    bin->prog_data->total_scratch) >> 4;
}
1197 
1198 static void
1199 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
1200 {
1201    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1202    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1203    const struct anv_shader_bin *vs_bin =
1204       pipeline->base.shaders[MESA_SHADER_VERTEX];
1205 
1206    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
1207 
1208    anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
1209       vs.Enable               = true;
1210       vs.StatisticsEnable     = true;
1211       vs.KernelStartPointer   = vs_bin->kernel.offset;
1212 #if GFX_VER < 20
1213       vs.SIMD8DispatchEnable  =
1214          vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8;
1215 #endif
1216 
1217       assert(!vs_prog_data->base.base.use_alt_mode);
1218 #if GFX_VER < 11
1219       vs.SingleVertexDispatch       = false;
1220 #endif
1221       vs.VectorMaskEnable           = false;
1222       /* Wa_1606682166:
1223        * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
1224        * Disable the Sampler state prefetch functionality in the SARB by
1225        * programming 0xB000[30] to '1'.
1226        */
1227       vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
1228       vs.BindingTableEntryCount     = vs_bin->bind_map.surface_count;
1229       vs.FloatingPointMode          = IEEE754;
1230       vs.IllegalOpcodeExceptionEnable = false;
1231       vs.SoftwareExceptionEnable    = false;
1232       vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;
1233 
1234       if (GFX_VER == 9 && devinfo->gt == 4 &&
1235           anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1236          /* On Sky Lake GT4, we have experienced some hangs related to the VS
1237           * cache and tessellation.  It is unknown exactly what is happening
1238           * but the Haswell docs for the "VS Reference Count Full Force Miss
1239           * Enable" field of the "Thread Mode" register refer to a HSW bug in
1240           * which the VUE handle reference count would overflow resulting in
1241           * internal reference counting bugs.  My (Faith's) best guess is that
1242           * this bug cropped back up on SKL GT4 when we suddenly had more
1243           * threads in play than any previous gfx9 hardware.
1244           *
1245           * What we do know for sure is that setting this bit when
1246           * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
1247           * City when playing with DXVK (https://bugs.freedesktop.org/107280).
1248           * Disabling the vertex cache with tessellation shaders should only
1249           * have a minor performance impact as the tessellation shaders are
1250           * likely generating and processing far more geometry than the vertex
1251           * stage.
1252           */
1253          vs.VertexCacheDisable = true;
1254       }
1255 
1256       vs.VertexURBEntryReadLength      = vs_prog_data->base.urb_read_length;
1257       vs.VertexURBEntryReadOffset      = 0;
1258       vs.DispatchGRFStartRegisterForURBData =
1259          vs_prog_data->base.base.dispatch_grf_start_reg;
1260 
1261       vs.UserClipDistanceClipTestEnableBitmask =
1262          vs_prog_data->base.clip_distance_mask;
1263       vs.UserClipDistanceCullTestEnableBitmask =
1264          vs_prog_data->base.cull_distance_mask;
1265 
1266 #if GFX_VERx10 >= 125
1267       vs.ScratchSpaceBuffer =
1268          get_scratch_surf(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
1269 #else
1270       vs.PerThreadScratchSpace   = get_scratch_space(vs_bin);
1271       vs.ScratchSpaceBasePointer =
1272          get_scratch_address(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
1273 #endif
1274    }
1275 }
1276 
1277 static void
1278 emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
1279                    const struct vk_tessellation_state *ts)
1280 {
1281    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1282       anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
1283       anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
1284       return;
1285    }
1286 
1287    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1288    const struct anv_shader_bin *tcs_bin =
1289       pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
1290    const struct anv_shader_bin *tes_bin =
1291       pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
1292 
1293    const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1294    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1295 
1296    anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
1297       hs.Enable = true;
1298       hs.StatisticsEnable = true;
1299       hs.KernelStartPointer = tcs_bin->kernel.offset;
1300       /* Wa_1606682166 */
1301       hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
1302       hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
1303 
1304 #if GFX_VER >= 12
1305       /* Wa_1604578095:
1306        *
1307        *    Hang occurs when the number of max threads is less than 2 times
1308        *    the instance count. The number of max threads must be more than
1309        *    2 times the instance count.
1310        */
1311       assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
1312 #endif
1313 
1314       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1315       hs.IncludeVertexHandles = true;
1316       hs.InstanceCount = tcs_prog_data->instances - 1;
1317 
1318       hs.VertexURBEntryReadLength = 0;
1319       hs.VertexURBEntryReadOffset = 0;
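      /* On Gfx12+ the GRF start register is split across two fields: the
       * low 5 bits here and the remaining bits in
       * DispatchGRFStartRegisterForURBData5.
       */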
1320       hs.DispatchGRFStartRegisterForURBData =
1321          tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
1322 #if GFX_VER >= 12
1323       hs.DispatchGRFStartRegisterForURBData5 =
1324          tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
1325 #endif
1326 
1327 #if GFX_VERx10 >= 125
1328       hs.ScratchSpaceBuffer =
1329          get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
1330 #else
1331       hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1332       hs.ScratchSpaceBasePointer =
1333          get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
1334 #endif
1335 
1336 #if GFX_VER == 12
1337       /* Patch Count threshold specifies the maximum number of patches that
1338        * will be accumulated before a thread dispatch is forced.
1339        */
1340       hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
1341 #endif
1342 
1343 #if GFX_VER < 20
1344       hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
1345 #endif
1346       hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
1347    }
1348 
1349    anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
1350       ds.Enable = true;
1351       ds.StatisticsEnable = true;
1352       ds.KernelStartPointer = tes_bin->kernel.offset;
1353       /* Wa_1606682166 */
1354       ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
1355       ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
1356       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1357 
1358       ds.ComputeWCoordinateEnable =
1359          tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
1360 
1361       ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1362       ds.PatchURBEntryReadOffset = 0;
1363       ds.DispatchGRFStartRegisterForURBData =
1364          tes_prog_data->base.base.dispatch_grf_start_reg;
1365 
1366 #if GFX_VER < 11
1367       ds.DispatchMode =
1368          tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8 ?
1369          DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1370          DISPATCH_MODE_SIMD4X2;
1371 #else
1372       assert(tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
1373       ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
1374 #endif
1375 
1376       ds.UserClipDistanceClipTestEnableBitmask =
1377          tes_prog_data->base.clip_distance_mask;
1378       ds.UserClipDistanceCullTestEnableBitmask =
1379          tes_prog_data->base.cull_distance_mask;
1380 
1381 #if GFX_VER >= 12
1382       ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
1383 #endif
1384 #if GFX_VERx10 >= 125
1385       ds.ScratchSpaceBuffer =
1386          get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
1387 #else
1388       ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1389       ds.ScratchSpaceBasePointer =
1390          get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
1391 #endif
1392    }
1393 }
1394 
1395 static UNUSED bool
1396 geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
1397 {
1398    const struct brw_tcs_prog_data *tcs_prog_data =
1399       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ?
1400       get_tcs_prog_data(pipeline) : NULL;
1401    const struct brw_tes_prog_data *tes_prog_data =
1402       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
1403       get_tes_prog_data(pipeline) : NULL;
1404    const struct brw_gs_prog_data *gs_prog_data =
1405       anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ?
1406       get_gs_prog_data(pipeline) : NULL;
1407 
1408    return (tcs_prog_data && tcs_prog_data->include_primitive_id) ||
1409           (tes_prog_data && tes_prog_data->include_primitive_id) ||
1410           (gs_prog_data && gs_prog_data->include_primitive_id);
1411 }
1412 
1413 static void
1414 emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
1415 {
1416    anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
1417       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1418          const struct brw_tes_prog_data *tes_prog_data =
1419             get_tes_prog_data(pipeline);
1420 
1421          te.Partitioning = tes_prog_data->partitioning;
1422          te.TEDomain = tes_prog_data->domain;
1423          te.TEEnable = true;
1424          te.MaximumTessellationFactorOdd = 63.0;
1425          te.MaximumTessellationFactorNotOdd = 64.0;
1426 #if GFX_VERx10 >= 125
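         /* Distribution mode controls how patches are spread round-robin
          * across the tessellation pipes; Wa_22012699309 requires the
          * strict variant rather than the free-running one.
          */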
1427          const struct anv_device *device = pipeline->base.base.device;
1428          if (intel_needs_workaround(device->info, 22012699309))
1429             te.TessellationDistributionMode = TEDMODE_RR_STRICT;
1430          else
1431             te.TessellationDistributionMode = TEDMODE_RR_FREE;
1432 
1433          if (intel_needs_workaround(device->info, 14015055625)) {
1434             /* Wa_14015055625:
1435              *
1436              * Disable Tessellation Distribution when primitive Id is enabled.
1437              */
1438             if (sbe_primitive_id_override(pipeline) ||
1439                 geom_or_tess_prim_id_used(pipeline))
1440                te.TessellationDistributionMode = TEDMODE_OFF;
1441          }
1442 
1443 #if GFX_VER >= 20
1444          te.TessellationDistributionLevel = TEDLEVEL_REGION;
1445 #else
1446          te.TessellationDistributionLevel = TEDLEVEL_PATCH;
1447 #endif
1448          /* 64_TRIANGLES */
1449          te.SmallPatchThreshold = 3;
1450          /* 1K_TRIANGLES */
1451          te.TargetBlockSize = 8;
1452          /* 1K_TRIANGLES */
1453          te.LocalBOPAccumulatorThreshold = 1;
1454 #endif
1455       }
1456    }
1457 }
1458 
1459 static void
1460 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
1461 {
1462    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1463       anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
1464       return;
1465    }
1466 
1467    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1468    const struct anv_shader_bin *gs_bin =
1469       pipeline->base.shaders[MESA_SHADER_GEOMETRY];
1470    const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
1471 
1472    anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
1473       gs.Enable                  = true;
1474       gs.StatisticsEnable        = true;
1475       gs.KernelStartPointer      = gs_bin->kernel.offset;
1476 #if GFX_VER < 20
1477       gs.DispatchMode            = gs_prog_data->base.dispatch_mode;
1478 #endif
1479 
1480       gs.SingleProgramFlow       = false;
1481       gs.VectorMaskEnable        = false;
1482       /* Wa_1606682166 */
1483       gs.SamplerCount            = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
1484       gs.BindingTableEntryCount  = gs_bin->bind_map.surface_count;
1485       gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
1486       gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;
1487 
1488       gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
1489 
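      /* output_vertex_size_hwords counts 32B HWords; the packet field is
       * encoded in 16B units, minus one.
       */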
1490       gs.OutputVertexSize        = gs_prog_data->output_vertex_size_hwords * 2 - 1;
1491       gs.OutputTopology          = gs_prog_data->output_topology;
1492       gs.ControlDataFormat       = gs_prog_data->control_data_format;
1493       gs.ControlDataHeaderSize   = gs_prog_data->control_data_header_size_hwords;
1494       gs.InstanceControl         = MAX2(gs_prog_data->invocations, 1) - 1;
1495 
1496       gs.ExpectedVertexCount     = gs_prog_data->vertices_in;
1497       gs.StaticOutput            = gs_prog_data->static_vertex_count >= 0;
1498       gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
1499          gs_prog_data->static_vertex_count : 0;
1500 
1501       gs.VertexURBEntryReadOffset = 0;
1502       gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
1503       gs.DispatchGRFStartRegisterForURBData =
1504          gs_prog_data->base.base.dispatch_grf_start_reg;
1505 
1506       gs.UserClipDistanceClipTestEnableBitmask =
1507          gs_prog_data->base.clip_distance_mask;
1508       gs.UserClipDistanceCullTestEnableBitmask =
1509          gs_prog_data->base.cull_distance_mask;
1510 
1511 #if GFX_VERx10 >= 125
1512       gs.ScratchSpaceBuffer =
1513          get_scratch_surf(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
1514 #else
1515       gs.PerThreadScratchSpace   = get_scratch_space(gs_bin);
1516       gs.ScratchSpaceBasePointer =
1517          get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
1518 #endif
1519    }
1520 }
1521 
1522 static void
1523 emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
1524                 const struct vk_input_assembly_state *ia,
1525                 const struct vk_rasterization_state *rs,
1526                 const struct vk_multisample_state *ms,
1527                 const struct vk_color_blend_state *cb,
1528                 const struct vk_render_pass_state *rp)
1529 {
1530    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1531 
1532    anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
1533       wm.StatisticsEnable                    = true;
1534       wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
1535       wm.LineAntialiasingRegionWidth         = _10pixels;
1536       wm.PointRasterizationRule              = RASTRULE_UPPER_LEFT;
1537 
1538       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1539          if (wm_prog_data->early_fragment_tests) {
1540             wm.EarlyDepthStencilControl         = EDSC_PREPS;
1541          } else if (wm_prog_data->has_side_effects) {
1542             wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
1543          } else {
1544             wm.EarlyDepthStencilControl         = EDSC_NORMAL;
1545          }
1546 
1547          /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
1548           * doesn't take into account KillPixels when no depth or stencil
1549           * writes are enabled. In order for occlusion queries to work
1550           * correctly with no attachments, we need to force-enable PS thread
1551           * dispatch.
1552           *
1553           * The BDW docs are pretty clear that this bit isn't validated
1554           * and probably shouldn't be used in production:
1555           *
1556           *    "This must always be set to Normal. This field should not be
1557           *     tested for functional validation."
1558           *
1559           * Unfortunately, however, the other mechanism we have for doing this
1560           * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
1561           * Given two bad options, we choose the one which works.
1562           */
1563          pipeline->force_fragment_thread_dispatch =
1564             wm_prog_data->has_side_effects ||
1565             wm_prog_data->uses_kill;
1566 
1567          wm.BarycentricInterpolationMode =
1568             wm_prog_data_barycentric_modes(wm_prog_data,
1569                                            pipeline->fs_msaa_flags);
1570       }
1571    }
1572 }
1573 
1574 static void
1575 emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
1576                 const struct vk_multisample_state *ms,
1577                 const struct vk_color_blend_state *cb)
1578 {
1579    UNUSED const struct intel_device_info *devinfo =
1580       pipeline->base.base.device->info;
1581    const struct anv_shader_bin *fs_bin =
1582       pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1583 
1584    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1585       anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);
1586       return;
1587    }
1588 
1589    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1590 
1591    anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps) {
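      /* Shared helper that decides which of the shader's SIMD8/16/32
       * dispatch modes to enable for the given sample count and MSAA
       * flags.
       */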
1592       intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
1593                                   ms != NULL ? ms->rasterization_samples : 1,
1594                                   pipeline->fs_msaa_flags);
1595 
1596       const bool persample =
1597          brw_wm_prog_data_is_persample(wm_prog_data, pipeline->fs_msaa_flags);
1598 
1599 #if GFX_VER == 12
1600       assert(wm_prog_data->dispatch_multi == 0 ||
1601              (wm_prog_data->dispatch_multi == 16 && wm_prog_data->max_polygons == 2));
1602       ps.DualSIMD8DispatchEnable = wm_prog_data->dispatch_multi;
1603       /* XXX - No major improvement observed from enabling
1604        *       overlapping subspans, but it could be helpful
1605        *       in theory when the requirements listed on the
1606        *       BSpec page for 3DSTATE_PS_BODY are met.
1607        */
1608       ps.OverlappingSubspansEnable = false;
1609 #endif
1610 
1611       ps.KernelStartPointer0 = fs_bin->kernel.offset +
1612                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
1613       ps.KernelStartPointer1 = fs_bin->kernel.offset +
1614                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
1615 #if GFX_VER < 20
1616       ps.KernelStartPointer2 = fs_bin->kernel.offset +
1617                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
1618 #endif
1619 
1620       ps.SingleProgramFlow          = false;
1621       ps.VectorMaskEnable           = wm_prog_data->uses_vmask;
1622       /* Wa_1606682166 */
1623       ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
1624       ps.BindingTableEntryCount     = fs_bin->bind_map.surface_count;
1625 #if GFX_VER < 20
1626       ps.PushConstantEnable         = wm_prog_data->base.nr_params > 0 ||
1627                                       wm_prog_data->base.ubo_ranges[0].length;
1628 #endif
1629       ps.PositionXYOffsetSelect     =
1630            !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
1631            persample ? POSOFFSET_SAMPLE : POSOFFSET_CENTROID;
1632 
1633       ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
1634 
1635       ps.DispatchGRFStartRegisterForConstantSetupData0 =
1636          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
1637       ps.DispatchGRFStartRegisterForConstantSetupData1 =
1638          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
1639 #if GFX_VER < 20
1640       ps.DispatchGRFStartRegisterForConstantSetupData2 =
1641          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
1642 #endif
1643 
1644 #if GFX_VERx10 >= 125
1645       ps.ScratchSpaceBuffer =
1646          get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
1647 #else
1648       ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
1649       ps.ScratchSpaceBasePointer =
1650          get_scratch_address(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
1651 #endif
1652    }
1653 }
1654 
1655 static void
1656 emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
1657                       const struct vk_rasterization_state *rs,
1658                       const struct vk_graphics_pipeline_state *state)
1659 {
1660    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1661 
1662    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1663       anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
1664       return;
1665    }
1666 
1667    anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
1668       ps.PixelShaderValid              = true;
1669 #if GFX_VER < 20
1670       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
1671 #endif
1672       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
1673       ps.PixelShaderIsPerSample        =
1674          brw_wm_prog_data_is_persample(wm_prog_data, pipeline->fs_msaa_flags);
1675       ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
1676       ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
1677       ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
1678 
1679       ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
1680 #if GFX_VER >= 20
1681       assert(!wm_prog_data->pulls_bary);
1682 #else
1683       ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
1684 #endif
1685 
1686       ps.InputCoverageMaskState = ICMS_NONE;
1687       assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
1688       if (!wm_prog_data->uses_sample_mask)
1689          ps.InputCoverageMaskState = ICMS_NONE;
1690       else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
1691          ps.InputCoverageMaskState  = ICMS_NORMAL;
1692       else if (wm_prog_data->post_depth_coverage)
1693          ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
1694       else
1695          ps.InputCoverageMaskState = ICMS_NORMAL;
1696 
1697 #if GFX_VER >= 11
1698       ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
1699          wm_prog_data->uses_depth_w_coefficients;
1700       ps.PixelShaderIsPerCoarsePixel =
1701          brw_wm_prog_data_is_coarse(wm_prog_data, pipeline->fs_msaa_flags);
1702 #endif
1703 #if GFX_VERx10 >= 125
1704       /* TODO: We should only require this when the last geometry shader uses
1705        *       a fragment shading rate that is not constant.
1706        */
1707       ps.EnablePSDependencyOnCPsizeChange =
1708          brw_wm_prog_data_is_coarse(wm_prog_data, pipeline->fs_msaa_flags);
1709 #endif
1710    }
1711 }
1712 
1713 static void
1714 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
1715 {
1716    anv_pipeline_emit(pipeline, final.vf_statistics,
1717                      GENX(3DSTATE_VF_STATISTICS), vfs) {
1718       vfs.StatisticsEnable = true;
1719    }
1720 }
1721 
1722 static void
1723 compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
1724                    const struct vk_multisample_state *ms,
1725                    const struct vk_graphics_pipeline_state *state)
1726 {
1727    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1728       pipeline->kill_pixel = false;
1729       return;
1730    }
1731 
1732    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1733 
1734    /* This computes the KillPixel portion of the computation for whether or
1735     * not we want to enable the PMA fix on gfx8 or gfx9.  It's given by this
1736     * chunk of the giant formula:
1737     *
1738     *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1739     *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1740     *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1741     *     3DSTATE_PS_BLEND::AlphaTestEnable ||
1742     *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1743     *
1744     * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
1745     * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
1746     * of an alpha test.
1747     */
1748    pipeline->rp_has_ds_self_dep =
1749       (state->pipeline_flags &
1750        VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) != 0;
1751    pipeline->kill_pixel =
1752       pipeline->rp_has_ds_self_dep ||
1753       wm_prog_data->uses_kill ||
1754       wm_prog_data->uses_omask ||
1755       (ms && ms->alpha_to_coverage_enable);
1756 }
1757 
1758 #if GFX_VER >= 12
1759 static void
1760 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
1761                                    const struct vk_render_pass_state *rp)
1762 {
1763    if (anv_pipeline_is_mesh(pipeline)) {
1764       anv_pipeline_emit(pipeline, final.primitive_replication,
1765                         GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1766       return;
1767    }
1768 
1769    const int replication_count =
1770       anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;
1771 
1772    assert(replication_count >= 1);
1773    if (replication_count == 1) {
1774       anv_pipeline_emit(pipeline, final.primitive_replication,
1775                         GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1776       return;
1777    }
1778 
1779    assert(replication_count == util_bitcount(rp->view_mask));
1780    assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
1781 
1782    anv_pipeline_emit(pipeline, final.primitive_replication,
1783                      GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
1784       pr.ReplicaMask = (1 << replication_count) - 1;
1785       pr.ReplicationCount = replication_count - 1;
1786 
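      /* Route each replica to its view's render target array index so a
       * single primitive is broadcast to every view in the view mask.
       */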
1787       int i = 0;
1788       u_foreach_bit(view_index, rp->view_mask) {
1789          pr.RTAIOffset[i] = view_index;
1790          i++;
1791       }
1792    }
1793 }
1794 #endif
1795 
1796 #if GFX_VERx10 >= 125
1797 static void
1798 emit_task_state(struct anv_graphics_pipeline *pipeline)
1799 {
1800    assert(anv_pipeline_is_mesh(pipeline));
1801 
1802    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
1803       anv_pipeline_emit(pipeline, final.task_control,
1804                         GENX(3DSTATE_TASK_CONTROL), zero);
1805       anv_pipeline_emit(pipeline, final.task_shader,
1806                         GENX(3DSTATE_TASK_SHADER), zero);
1807       anv_pipeline_emit(pipeline, final.task_redistrib,
1808                         GENX(3DSTATE_TASK_REDISTRIB), zero);
1809       return;
1810    }
1811 
1812    const struct anv_shader_bin *task_bin =
1813       pipeline->base.shaders[MESA_SHADER_TASK];
1814 
1815    anv_pipeline_emit(pipeline, final.task_control,
1816                      GENX(3DSTATE_TASK_CONTROL), tc) {
1817       tc.TaskShaderEnable = true;
1818       tc.ScratchSpaceBuffer =
1819          get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
1820       tc.MaximumNumberofThreadGroups = 511;
1821    }
1822 
1823    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1824    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1825    const struct intel_cs_dispatch_info task_dispatch =
1826       brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
1827 
1828    anv_pipeline_emit(pipeline, final.task_shader,
1829                      GENX(3DSTATE_TASK_SHADER), task) {
1830       task.KernelStartPointer                = task_bin->kernel.offset;
1831       task.SIMDSize                          = task_dispatch.simd_size / 16;
1832       task.MessageSIMD                       = task.SIMDSize;
1833       task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads;
1834       task.ExecutionMask                     = task_dispatch.right_mask;
1835       task.LocalXMaximum                     = task_dispatch.group_size - 1;
1836       task.EmitLocalIDX                      = true;
1837 
1838       task.NumberofBarriers                  = task_prog_data->base.uses_barrier;
1839       task.SharedLocalMemorySize             =
1840          encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared);
1841       task.PreferredSLMAllocationSize        =
1842          preferred_slm_allocation_size(devinfo);
1843 
1844       /*
1845        * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will hold the address of a
1846        * buffer with the push constants and descriptor set table, and
1847        * InlineData[2:7] will hold the first few push constants.
1848        */
1849       task.EmitInlineParameter = true;
1850 
1851       task.XP0Required = task_prog_data->uses_drawid;
1852    }
1853 
1854    /* Recommended values from "Task and Mesh Distribution Programming". */
1855    anv_pipeline_emit(pipeline, final.task_redistrib,
1856                      GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
1857       redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
1858       redistrib.SmallTaskThreshold = 1; /* 2^N */
1859       redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
1860       redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM;
1861       redistrib.TaskRedistributionMode = TASKREDISTRIB_RR_STRICT;
1862    }
1863 }
1864 
1865 static void
1866 emit_mesh_state(struct anv_graphics_pipeline *pipeline)
1867 {
1868    assert(anv_pipeline_is_mesh(pipeline));
1869 
1870    const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
1871 
1872    anv_pipeline_emit(pipeline, final.mesh_control,
1873                      GENX(3DSTATE_MESH_CONTROL), mc) {
1874       mc.MeshShaderEnable = true;
1875       mc.ScratchSpaceBuffer =
1876          get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
1877       mc.MaximumNumberofThreadGroups = 511;
1878    }
1879 
1880    const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1881    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1882    const struct intel_cs_dispatch_info mesh_dispatch =
1883       brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL);
1884 
1885    const unsigned output_topology =
1886       mesh_prog_data->primitive_type == MESA_PRIM_POINTS ? OUTPUT_POINT :
1887       mesh_prog_data->primitive_type == MESA_PRIM_LINES  ? OUTPUT_LINE :
1888                                                              OUTPUT_TRI;
1889 
1890    uint32_t index_format;
1891    switch (mesh_prog_data->index_format) {
1892    case BRW_INDEX_FORMAT_U32:
1893       index_format = INDEX_U32;
1894       break;
1895    case BRW_INDEX_FORMAT_U888X:
1896       index_format = INDEX_U888X;
1897       break;
1898    default:
1899       unreachable("invalid index format");
1900    }
1901 
1902    anv_pipeline_emit(pipeline, final.mesh_shader,
1903                      GENX(3DSTATE_MESH_SHADER), mesh) {
1904       mesh.KernelStartPointer                = mesh_bin->kernel.offset;
1905       mesh.SIMDSize                          = mesh_dispatch.simd_size / 16;
1906       mesh.MessageSIMD                       = mesh.SIMDSize;
1907       mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads;
1908       mesh.ExecutionMask                     = mesh_dispatch.right_mask;
1909       mesh.LocalXMaximum                     = mesh_dispatch.group_size - 1;
1910       mesh.EmitLocalIDX                      = true;
1911 
1912       mesh.MaximumPrimitiveCount             = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
1913       mesh.OutputTopology                    = output_topology;
1914       mesh.PerVertexDataPitch                = mesh_prog_data->map.per_vertex_pitch_dw / 8;
1915       mesh.PerPrimitiveDataPresent           = mesh_prog_data->map.per_primitive_pitch_dw > 0;
1916       mesh.PerPrimitiveDataPitch             = mesh_prog_data->map.per_primitive_pitch_dw / 8;
1917       mesh.IndexFormat                       = index_format;
1918 
1919       mesh.NumberofBarriers                  = mesh_prog_data->base.uses_barrier;
1920       mesh.SharedLocalMemorySize             =
1921          encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared);
1922       mesh.PreferredSLMAllocationSize        =
1923          preferred_slm_allocation_size(devinfo);
1924 
1925       /*
1926        * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will hold the address of a
1927        * buffer with the push constants and descriptor set table, and
1928        * InlineData[2:7] will hold the first few push constants.
1929        */
1930       mesh.EmitInlineParameter = true;
1931 
1932       mesh.XP0Required = mesh_prog_data->uses_drawid;
1933    }
1934 
1935    /* Recommended values from "Task and Mesh Distribution Programming". */
1936    anv_pipeline_emit(pipeline, final.mesh_distrib,
1937                      GENX(3DSTATE_MESH_DISTRIB), distrib) {
1938       distrib.DistributionMode = MESH_RR_FREE;
1939       distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
1940       distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
1941    }
1942 }
1943 #endif
1944 
1945 void
1946 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
1947                              const struct vk_graphics_pipeline_state *state)
1948 {
1949    enum intel_urb_deref_block_size urb_deref_block_size;
1950    emit_urb_setup(pipeline, &urb_deref_block_size);
1951 
1952    emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp,
1953                  urb_deref_block_size);
1954    emit_ms_state(pipeline, state->ms);
1955    compute_kill_pixel(pipeline, state->ms, state);
1956 
1957    emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);
1958 
1959 #if GFX_VER >= 12
1960    emit_3dstate_primitive_replication(pipeline, state->rp);
1961 #endif
1962 
1963 #if GFX_VERx10 >= 125
1964    anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
1965       /* If 3DSTATE_TE::TEEnable == 1 then RR_STRICT else RR_FREE */
1966       vfg.DistributionMode =
1967          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
1968          RR_FREE;
1969       vfg.DistributionGranularity = BatchLevelGranularity;
1970 #if INTEL_WA_14014851047_GFX_VER
1971       vfg.GranularityThresholdDisable =
1972          intel_needs_workaround(pipeline->base.base.device->info, 14014851047);
1973 #endif
1974       /* 192 vertices for TRILIST_ADJ */
1975       vfg.ListNBatchSizeScale = 0;
1976       /* Batch size of 384 vertices */
1977       vfg.List3BatchSizeScale = 2;
1978       /* Batch size of 128 vertices */
1979       vfg.List2BatchSizeScale = 1;
1980       /* Batch size of 128 vertices */
1981       vfg.List1BatchSizeScale = 2;
1982       /* Batch size of 256 vertices for STRIP topologies */
1983       vfg.StripBatchSizeScale = 3;
1984       /* 192 control points for PATCHLIST_3 */
1985       vfg.PatchBatchSizeScale = 1;
1986       /* 192 control points for PATCHLIST_3 */
1987       vfg.PatchBatchSizeMultiplier = 31;
1988    }
1989 #endif
1990 
1991    emit_3dstate_vf_statistics(pipeline);
1992 
1993    if (anv_pipeline_is_primitive(pipeline)) {
1994       emit_vertex_input(pipeline, state, state->vi);
1995 
1996       emit_3dstate_vs(pipeline);
1997       emit_3dstate_hs_ds(pipeline, state->ts);
1998       emit_3dstate_te(pipeline);
1999       emit_3dstate_gs(pipeline);
2000 
2001       emit_3dstate_streamout(pipeline, state->rs);
2002 
2003 #if GFX_VERx10 >= 125
2004       const struct anv_device *device = pipeline->base.base.device;
2005       /* Disable Mesh. */
2006       if (device->vk.enabled_extensions.EXT_mesh_shader) {
2007          anv_pipeline_emit(pipeline, final.mesh_control,
2008                            GENX(3DSTATE_MESH_CONTROL), zero);
2009          anv_pipeline_emit(pipeline, final.mesh_shader,
2010                            GENX(3DSTATE_MESH_SHADER), zero);
2011          anv_pipeline_emit(pipeline, final.mesh_distrib,
2012                            GENX(3DSTATE_MESH_DISTRIB), zero);
2013          anv_pipeline_emit(pipeline, final.clip_mesh,
2014                            GENX(3DSTATE_CLIP_MESH), zero);
2015          anv_pipeline_emit(pipeline, final.sbe_mesh,
2016                            GENX(3DSTATE_SBE_MESH), zero);
2017          anv_pipeline_emit(pipeline, final.task_control,
2018                            GENX(3DSTATE_TASK_CONTROL), zero);
2019          anv_pipeline_emit(pipeline, final.task_shader,
2020                            GENX(3DSTATE_TASK_SHADER), zero);
2021          anv_pipeline_emit(pipeline, final.task_redistrib,
2022                            GENX(3DSTATE_TASK_REDISTRIB), zero);
2023       }
2024 #endif
2025    } else {
2026       assert(anv_pipeline_is_mesh(pipeline));
2027 
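      /* Mesh pipelines have no legacy geometry stages, so emit zeroed
       * (disabled) packets for the entire VS/HS/DS/TE/GS path.
       */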
2028       anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs);
2029 #if GFX_VER >= 11
2030       anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs);
2031 #endif
2032       anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs);
2033       anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
2034       anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
2035       anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te);
2036       anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
2037 
2038       /* BSpec 46303 forbids 3DSTATE_MESH_CONTROL.MeshShaderEnable and
2039        * 3DSTATE_STREAMOUT.SOFunctionEnable from both being 1.
2040        */
2041       anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
2042 
2043 #if GFX_VERx10 >= 125
2044       emit_task_state(pipeline);
2045       emit_mesh_state(pipeline);
2046 #endif
2047    }
2048 
2049    emit_3dstate_sbe(pipeline);
2050    emit_3dstate_wm(pipeline, state->ia, state->rs,
2051                    state->ms, state->cb, state->rp);
2052    emit_3dstate_ps(pipeline, state->ms, state->cb);
2053    emit_3dstate_ps_extra(pipeline, state->rs, state);
2054 }
2055 
2056 #if GFX_VERx10 >= 125
2057 
2058 void
2059 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
2060 {
2061    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2062    anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2063 }
2064 
2065 #else /* #if GFX_VERx10 >= 125 */
2066 
2067 void
2068 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
2069 {
2070    struct anv_device *device = pipeline->base.device;
2071    const struct intel_device_info *devinfo = device->info;
2072    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2073 
2074    anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2075 
2076    const struct intel_cs_dispatch_info dispatch =
2077       brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
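   /* CURBE holds the cross-thread constants plus one copy of the
    * per-thread constants for each dispatched thread, in 32B register
    * units, aligned up to an even number of registers.
    */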
2078    const uint32_t vfe_curbe_allocation =
2079       ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
2080             cs_prog_data->push.cross_thread.regs, 2);
2081 
2082    const struct anv_shader_bin *cs_bin = pipeline->cs;
2083 
2084    anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
2085       vfe.StackSize              = 0;
2086       vfe.MaximumNumberofThreads =
2087          devinfo->max_cs_threads * devinfo->subslice_total - 1;
2088       vfe.NumberofURBEntries     = 2;
2089 #if GFX_VER < 11
2090       vfe.ResetGatewayTimer      = true;
2091 #endif
2092       vfe.URBEntryAllocationSize = 2;
2093       vfe.CURBEAllocationSize    = vfe_curbe_allocation;
2094 
2095       if (cs_bin->prog_data->total_scratch) {
2096          /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
2097           * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
2098           */
2099          vfe.PerThreadScratchSpace =
2100             ffs(cs_bin->prog_data->total_scratch) - 11;
2101          vfe.ScratchSpaceBasePointer =
2102             get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2103       }
2104    }
2105 
2106    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
2107       .KernelStartPointer     =
2108          cs_bin->kernel.offset +
2109          brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
2110 
2111       /* Wa_1606682166 */
2112       .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
2113       /* We add 1 because the CS indirect parameters buffer isn't accounted
2114        * for in bind_map.surface_count.
2115        *
2116        * Typically set to 0 to avoid prefetching on every thread dispatch.
2117        */
2118       .BindingTableEntryCount = devinfo->verx10 == 125 ?
2119          0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
2120       .BarrierEnable          = cs_prog_data->uses_barrier,
2121       .SharedLocalMemorySize  =
2122          encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
2123 
2124       .ConstantURBEntryReadOffset = 0,
2125       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
2126       .CrossThreadConstantDataReadLength =
2127          cs_prog_data->push.cross_thread.regs,
2128 #if GFX_VER >= 12
2129       /* TODO: Check if we are missing workarounds and enable mid-thread
2130        * preemption.
2131        *
2132        * We still have issues with mid-thread preemption (it was already
2133        * disabled by the kernel on gfx11, due to missing workarounds). It's
2134        * possible that we are just missing some workarounds, and could enable
2135        * it later, but for now let's disable it to fix a GPU in compute in Car
2136        * Chase (and possibly more).
2137        */
2138       .ThreadPreemptionDisable = true,
2139 #endif
2140 
2141       .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
2142    };
2143    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
2144                                         pipeline->interface_descriptor_data,
2145                                         &desc);
2146 }
2147 
2148 #endif /* #if GFX_VERx10 >= 125 */
2149 
2150 #if GFX_VERx10 >= 125
2151 
2152 void
2153 genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
2154 {
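   /* Pack one shader binding table handle per group; the layouts below
    * mirror the RT_*_SBT_HANDLE structs the ray-tracing hardware consumes.
    */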
2155    for (uint32_t i = 0; i < pipeline->group_count; i++) {
2156       struct anv_rt_shader_group *group = &pipeline->groups[i];
2157 
2158       switch (group->type) {
2159       case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
2160          struct GENX(RT_GENERAL_SBT_HANDLE) sh = {};
2161          sh.General = anv_shader_bin_get_bsr(group->general, 32);
2162          GENX(RT_GENERAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2163          break;
2164       }
2165 
2166       case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
2167          struct GENX(RT_TRIANGLES_SBT_HANDLE) sh = {};
2168          if (group->closest_hit)
2169             sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2170          if (group->any_hit)
2171             sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
2172          GENX(RT_TRIANGLES_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2173          break;
2174       }
2175 
2176       case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
2177          struct GENX(RT_PROCEDURAL_SBT_HANDLE) sh = {};
2178          if (group->closest_hit)
2179             sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2180          sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
2181          GENX(RT_PROCEDURAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2182          break;
2183       }
2184 
2185       default:
2186          unreachable("Invalid shader group type");
2187       }
2188    }
2189 }
2190 
2191 #else
2192 
2193 void
2194 genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
2195 {
2196    unreachable("Ray tracing not supported");
2197 }
2198 
2199 #endif /* GFX_VERx10 >= 125 */
2200