1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "anv_private.h"
25
26 #include "genxml/gen_macros.h"
27 #include "genxml/genX_pack.h"
28 #include "genxml/genX_rt_pack.h"
29
30 #include "common/intel_genX_state_brw.h"
31 #include "common/intel_l3_config.h"
32 #include "common/intel_sample_positions.h"
33 #include "nir/nir_xfb_info.h"
34 #include "vk_util.h"
35 #include "vk_format.h"
36 #include "vk_log.h"
37 #include "vk_render_pass.h"
38
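/* Helper used by the emission macros below: reserves n_dwords in the
 * pipeline's batch and records their (offset, length) against the given
 * anv_gfx_state_ptr. The assert checks that all dwords attributed to one
 * state pointer stay contiguous in the batch, so the recorded range can
 * later be copied as a unit into a command buffer.
 */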
39 static inline struct anv_batch *
40 anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
41 struct anv_gfx_state_ptr *ptr,
42 uint32_t n_dwords)
43 {
44 struct anv_batch *batch = &pipeline->base.base.batch;
45
46 assert(ptr->len == 0 ||
47 (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
48 if (ptr->len == 0)
49 ptr->offset = (batch->next - batch->start) / 4;
50 ptr->len += n_dwords;
51
52 return batch;
53 }
54
55 #define anv_pipeline_emit(pipeline, state, cmd, name) \
56 for (struct cmd name = { __anv_cmd_header(cmd) }, \
57 *_dst = anv_batch_emit_dwords( \
58 anv_gfx_pipeline_add(pipeline, \
59 &(pipeline)->state, \
60 __anv_cmd_length(cmd)), \
61 __anv_cmd_length(cmd)); \
62 __builtin_expect(_dst != NULL, 1); \
63 ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch, \
64 _dst, &name); \
65 VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
66 _dst = NULL; \
67 }))
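/* Illustrative usage, mirroring the call sites further down in this file:
 *
 *    anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
 *       vs.Enable = true;
 *    }
 *
 * The for-loop construct gives the body a named struct to fill in, then
 * packs it into the pipeline batch and records the dword range against
 * the given state pointer via anv_gfx_pipeline_add().
 */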
68
69 #define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({ \
70 void *__dst = anv_batch_emit_dwords( \
71 anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n); \
72 if (__dst) { \
73 struct cmd __template = { \
74 __anv_cmd_header(cmd), \
75 .DWordLength = n - __anv_cmd_length_bias(cmd), \
76 __VA_ARGS__ \
77 }; \
78 __anv_cmd_pack(cmd)(&pipeline->base.base.batch, \
79 __dst, &__template); \
80 } \
81 __dst; \
82 })
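/* Variable-length variant: reserves n dwords, fills in DWordLength
 * explicitly, and returns the destination pointer so the caller can pack
 * trailing entries itself (used below for 3DSTATE_SO_DECL_LIST).
 */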
83
84
85 static uint32_t
86 vertex_element_comp_control(enum isl_format format, unsigned comp)
87 {
88 uint8_t bits;
89 switch (comp) {
90 case 0: bits = isl_format_layouts[format].channels.r.bits; break;
91 case 1: bits = isl_format_layouts[format].channels.g.bits; break;
92 case 2: bits = isl_format_layouts[format].channels.b.bits; break;
93 case 3: bits = isl_format_layouts[format].channels.a.bits; break;
94 default: unreachable("Invalid component");
95 }
96
97 /*
98 * Take into account hardware restrictions when dealing with 64-bit floats.
99 *
100 * From Broadwell spec, command reference structures, page 586:
101 * "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
102 * 64-bit components are stored in the URB without any conversion. In
103 * this case, vertex elements must be written as 128 or 256 bits, with
104 * VFCOMP_STORE_0 being used to pad the output as required. E.g., if
105 * R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
106 * Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
107 * set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
108 * Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
109 * a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
110 * Component 3 to be specified as VFCOMP_STORE_0 in order to output a
111 * 256-bit vertex element."
112 */
113 if (bits) {
114 return VFCOMP_STORE_SRC;
115 } else if (comp >= 2 &&
116 !isl_format_layouts[format].channels.b.bits &&
117 isl_format_layouts[format].channels.r.type == ISL_RAW) {
118 /* When emitting 64-bit attributes, we need to write either 128 or 256
119 * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
120 * VFCOMP_STORE_0 to pad the written chunk */
121 return VFCOMP_NOSTORE;
122 } else if (comp < 3 ||
123 isl_format_layouts[format].channels.r.type == ISL_RAW) {
124 /* Note we need to pad with value 0, not 1, due to hardware restrictions
125 * (see comment above) */
126 return VFCOMP_STORE_0;
127 } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
128 isl_format_layouts[format].channels.r.type == ISL_SINT) {
129 assert(comp == 3);
130 return VFCOMP_STORE_1_INT;
131 } else {
132 assert(comp == 3);
133 return VFCOMP_STORE_1_FP;
134 }
135 }
136
137 void
138 genX(emit_vertex_input)(struct anv_batch *batch,
139 uint32_t *vertex_element_dws,
140 struct anv_graphics_pipeline *pipeline,
141 const struct vk_vertex_input_state *vi,
142 bool emit_in_pipeline)
143 {
144 const struct anv_device *device = pipeline->base.base.device;
145 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
146 const uint64_t inputs_read = vs_prog_data->inputs_read;
147 const uint64_t double_inputs_read =
148 vs_prog_data->double_inputs_read & inputs_read;
149 assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
150 const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
151 const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
152
153 for (uint32_t i = 0; i < pipeline->vs_input_elements; i++) {
154 /* The SKL docs for VERTEX_ELEMENT_STATE say:
155 *
156 * "All elements must be valid from Element[0] to the last valid
157 * element. (I.e. if Element[2] is valid then Element[1] and
158 * Element[0] must also be valid)."
159 *
160 * The SKL docs for 3D_Vertex_Component_Control say:
161 *
162 * "Don't store this component. (Not valid for Component 0, but can
163 * be used for Component 1-3)."
164 *
165 * So we can't just leave a vertex element blank and hope for the best.
166 * We have to tell the VF hardware to put something in it; so we just
167 * store a bunch of zeros.
168 *
169 * TODO: Compact vertex elements so we never end up with holes.
170 */
171 struct GENX(VERTEX_ELEMENT_STATE) element = {
172 .Valid = true,
173 .Component0Control = VFCOMP_STORE_0,
174 .Component1Control = VFCOMP_STORE_0,
175 .Component2Control = VFCOMP_STORE_0,
176 .Component3Control = VFCOMP_STORE_0,
177 };
178 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
179 &vertex_element_dws[i * 2],
180 &element);
181 }
182
183 u_foreach_bit(a, vi->attributes_valid) {
184 enum isl_format format = anv_get_isl_format(device->info,
185 vi->attributes[a].format,
186 VK_IMAGE_ASPECT_COLOR_BIT,
187 VK_IMAGE_TILING_LINEAR);
188 assume(format < ISL_NUM_FORMATS);
189
190 uint32_t binding = vi->attributes[a].binding;
191 assert(binding < MAX_VBS);
192
193 if ((elements & (1 << a)) == 0)
194 continue; /* Binding unused */
195
196 uint32_t slot =
197 __builtin_popcount(elements & ((1 << a) - 1)) -
198 DIV_ROUND_UP(__builtin_popcount(elements_double &
199 ((1 << a) -1)), 2);
200
201 struct GENX(VERTEX_ELEMENT_STATE) element = {
202 .VertexBufferIndex = vi->attributes[a].binding,
203 .Valid = true,
204 .SourceElementFormat = format,
205 .EdgeFlagEnable = false,
206 .SourceElementOffset = vi->attributes[a].offset,
207 .Component0Control = vertex_element_comp_control(format, 0),
208 .Component1Control = vertex_element_comp_control(format, 1),
209 .Component2Control = vertex_element_comp_control(format, 2),
210 .Component3Control = vertex_element_comp_control(format, 3),
211 };
212 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
213 &vertex_element_dws[slot * 2],
214 &element);
215
216 /* On Broadwell and later, we have a separate VF_INSTANCING packet
217 * that controls instancing. On Haswell and prior, that's part of
218 * VERTEX_BUFFER_STATE which we emit later.
219 */
220 if (emit_in_pipeline) {
221 anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
222 bool per_instance = vi->bindings[binding].input_rate ==
223 VK_VERTEX_INPUT_RATE_INSTANCE;
224 uint32_t divisor = vi->bindings[binding].divisor *
225 pipeline->instance_multiplier;
226
227 vfi.InstancingEnable = per_instance;
228 vfi.VertexElementIndex = slot;
229 vfi.InstanceDataStepRate = per_instance ? divisor : 1;
230 }
231 } else {
232 anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
233 bool per_instance = vi->bindings[binding].input_rate ==
234 VK_VERTEX_INPUT_RATE_INSTANCE;
235 uint32_t divisor = vi->bindings[binding].divisor *
236 pipeline->instance_multiplier;
237
238 vfi.InstancingEnable = per_instance;
239 vfi.VertexElementIndex = slot;
240 vfi.InstanceDataStepRate = per_instance ? divisor : 1;
241 }
242 }
243 }
244 }
245
246 static void
247 emit_vertex_input(struct anv_graphics_pipeline *pipeline,
248 const struct vk_graphics_pipeline_state *state,
249 const struct vk_vertex_input_state *vi)
250 {
251 /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
252 * everything in gfx8_cmd_buffer.c
253 */
254 if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
255 genX(emit_vertex_input)(NULL,
256 pipeline->vertex_input_data,
257 pipeline, vi, true /* emit_in_pipeline */);
258 }
259
260 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
261 const bool needs_svgs_elem = pipeline->svgs_count > 1 ||
262 !vs_prog_data->uses_drawid;
263 const uint32_t id_slot = pipeline->vs_input_elements;
264 const uint32_t drawid_slot = id_slot + needs_svgs_elem;
265 if (pipeline->svgs_count > 0) {
266 assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
267 uint32_t slot_offset =
268 pipeline->vertex_input_elems - pipeline->svgs_count;
269
270 if (needs_svgs_elem) {
271 #if GFX_VER < 11
272 /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
273 * "Within a VERTEX_ELEMENT_STATE structure, if a Component
274 * Control field is set to something other than VFCOMP_STORE_SRC,
275 * no higher-numbered Component Control fields may be set to
276 * VFCOMP_STORE_SRC"
277 *
278 * This means, that if we have BaseInstance, we need BaseVertex as
279 * well. Just do all or nothing.
280 */
281 uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
282 vs_prog_data->uses_baseinstance) ?
283 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
284 #endif
285
286 struct GENX(VERTEX_ELEMENT_STATE) element = {
287 .VertexBufferIndex = ANV_SVGS_VB_INDEX,
288 .Valid = true,
289 .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
290 #if GFX_VER >= 11
291 /* On gen11, these are taken care of by extra parameter slots */
292 .Component0Control = VFCOMP_STORE_0,
293 .Component1Control = VFCOMP_STORE_0,
294 #else
295 .Component0Control = base_ctrl,
296 .Component1Control = base_ctrl,
297 #endif
298 .Component2Control = VFCOMP_STORE_0,
299 .Component3Control = VFCOMP_STORE_0,
300 };
301 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
302 &pipeline->vertex_input_data[slot_offset * 2],
303 &element);
304 slot_offset++;
305
306 anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
307 GENX(3DSTATE_VF_INSTANCING), vfi) {
308 vfi.VertexElementIndex = id_slot;
309 }
310 }
311
312 if (vs_prog_data->uses_drawid) {
313 struct GENX(VERTEX_ELEMENT_STATE) element = {
314 .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
315 .Valid = true,
316 .SourceElementFormat = ISL_FORMAT_R32_UINT,
317 #if GFX_VER >= 11
318 /* On gen11, this is taken care of by extra parameter slots */
319 .Component0Control = VFCOMP_STORE_0,
320 #else
321 .Component0Control = VFCOMP_STORE_SRC,
322 #endif
323 .Component1Control = VFCOMP_STORE_0,
324 .Component2Control = VFCOMP_STORE_0,
325 .Component3Control = VFCOMP_STORE_0,
326 };
327 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
328 &pipeline->vertex_input_data[slot_offset * 2],
329 &element);
330 slot_offset++;
331
332 anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
333 GENX(3DSTATE_VF_INSTANCING), vfi) {
334 vfi.VertexElementIndex = drawid_slot;
335 }
336 }
337 }
338
339 anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
340 sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
341 sgvs.VertexIDComponentNumber = 2;
342 sgvs.VertexIDElementOffset = id_slot;
343 sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid;
344 sgvs.InstanceIDComponentNumber = 3;
345 sgvs.InstanceIDElementOffset = id_slot;
346 }
347
348 #if GFX_VER >= 11
349 anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
350 /* gl_BaseVertex */
351 sgvs.XP0Enable = vs_prog_data->uses_firstvertex;
352 sgvs.XP0SourceSelect = XP0_PARAMETER;
353 sgvs.XP0ComponentNumber = 0;
354 sgvs.XP0ElementOffset = id_slot;
355
356 /* gl_BaseInstance */
357 sgvs.XP1Enable = vs_prog_data->uses_baseinstance;
358 sgvs.XP1SourceSelect = StartingInstanceLocation;
359 sgvs.XP1ComponentNumber = 1;
360 sgvs.XP1ElementOffset = id_slot;
361
362 /* gl_DrawID */
363 sgvs.XP2Enable = vs_prog_data->uses_drawid;
364 sgvs.XP2ComponentNumber = 0;
365 sgvs.XP2ElementOffset = drawid_slot;
366 }
367 #endif
368 }
369
370 void
371 genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
372 const struct intel_l3_config *l3_config,
373 VkShaderStageFlags active_stages,
374 const struct intel_urb_config *urb_cfg_in,
375 struct intel_urb_config *urb_cfg_out,
376 enum intel_urb_deref_block_size *deref_block_size)
377 {
378 const struct intel_device_info *devinfo = device->info;
379
380 bool constrained;
381 intel_get_urb_config(devinfo, l3_config,
382 active_stages &
383 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
384 active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
385 urb_cfg_out, deref_block_size,
386 &constrained);
387
388 #if INTEL_NEEDS_WA_16014912113
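   /* Workaround sketch: before switching to the new URB layout, re-emit the
    * previous per-stage start/size values (256 entries for VS, none for the
    * other stages) followed by an HDC pipeline flush. This only describes
    * what the code below does; see Wa_16014912113 for the authoritative
    * rationale.
    */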
389 if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out,
390 MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) {
391 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
392 anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
393 urb._3DCommandSubOpcode += i;
394 urb.VSURBStartingAddress = urb_cfg_in->start[i];
395 urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1;
396 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
397 }
398 }
399 genx_batch_emit_pipe_control(batch, device->info, _3D,
400 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
401 }
402 #endif
403
404 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
405 anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
406 urb._3DCommandSubOpcode += i;
407 urb.VSURBStartingAddress = urb_cfg_out->start[i];
408 urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1;
409 urb.VSNumberofURBEntries = urb_cfg_out->entries[i];
410 }
411 }
412 #if GFX_VERx10 >= 125
413 if (device->vk.enabled_extensions.EXT_mesh_shader) {
414 anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
415 anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
416 }
417 #endif
418 }
419
420 #if GFX_VERx10 >= 125
421 static void
422 emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
423 enum intel_urb_deref_block_size *deref_block_size)
424 {
425 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
426
427 const struct brw_task_prog_data *task_prog_data =
428 anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
429 get_task_prog_data(pipeline) : NULL;
430 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
431
432 const struct intel_mesh_urb_allocation alloc =
433 intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
434 task_prog_data ? task_prog_data->map.size_dw : 0,
435 mesh_prog_data->map.size_dw);
436
437 /* Zero out the primitive pipeline URB allocations. */
438 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
439 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
440 urb._3DCommandSubOpcode += i;
441 }
442 }
443
444 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
445 if (task_prog_data) {
446 urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1;
447 urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
448 urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
449 urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
450 urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
451 }
452 }
453
454 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
455 urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1;
456 urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
457 urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
458 urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
459 urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
460 }
461
462 *deref_block_size = alloc.deref_block_size;
463 }
464 #endif
465
466 static void
467 emit_urb_setup(struct anv_graphics_pipeline *pipeline,
468 enum intel_urb_deref_block_size *deref_block_size)
469 {
470 #if GFX_VERx10 >= 125
471 if (anv_pipeline_is_mesh(pipeline)) {
472 emit_urb_setup_mesh(pipeline, deref_block_size);
473 return;
474 }
475 #endif
476 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
477 const struct brw_vue_prog_data *prog_data =
478 !anv_pipeline_has_stage(pipeline, i) ? NULL :
479 (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data;
480
481 pipeline->urb_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
482 }
483
484 struct anv_device *device = pipeline->base.base.device;
485 const struct intel_device_info *devinfo = device->info;
486
487
488 bool constrained;
489 intel_get_urb_config(devinfo,
490 pipeline->base.base.l3_config,
491 pipeline->base.base.active_stages &
492 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
493 pipeline->base.base.active_stages &
494 VK_SHADER_STAGE_GEOMETRY_BIT,
495 &pipeline->urb_cfg, deref_block_size,
496 &constrained);
497
498 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
499 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
500 urb._3DCommandSubOpcode += i;
501 urb.VSURBStartingAddress = pipeline->urb_cfg.start[i];
502 urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1;
503 urb.VSNumberofURBEntries = pipeline->urb_cfg.entries[i];
504 }
505 }
506
507 #if GFX_VERx10 >= 125
508 if (device->vk.enabled_extensions.EXT_mesh_shader) {
509 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
510 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
511 }
512 #endif
513
514 }
515
516 static bool
517 sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
518 {
519 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
520 if (!wm_prog_data)
521 return false;
522
523 const struct intel_vue_map *fs_input_map =
524 &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
525
526 return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
527 fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
528 }
529
530 static void
531 emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
532 {
533 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
534
535 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
536 anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
537 anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
538 #if GFX_VERx10 >= 125
539 if (anv_pipeline_is_mesh(pipeline))
540 anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
541 #endif
542 return;
543 }
544
545 anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
546 anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
547
548 /* TODO(mesh): Figure out cases where we need attribute swizzling. See also
549 * calculate_urb_setup() and related functions.
550 */
551 sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
552 sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
553 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
554 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
555
556 for (unsigned i = 0; i < 32; i++)
557 sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
558
559 if (anv_pipeline_is_primitive(pipeline)) {
560 const struct intel_vue_map *fs_input_map =
561 &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
562
563 int first_slot =
564 brw_compute_first_urb_slot_required(wm_prog_data->inputs,
565 fs_input_map);
566 assert(first_slot % 2 == 0);
567 unsigned urb_entry_read_offset = first_slot / 2;
568 int max_source_attr = 0;
569 for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
570 uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
571 int input_index = wm_prog_data->urb_setup[attr];
572
573 assert(0 <= input_index);
574
575 /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
576 * VUE header
577 */
578 if (attr == VARYING_SLOT_VIEWPORT ||
579 attr == VARYING_SLOT_LAYER ||
580 attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
581 continue;
582 }
583
584 if (attr == VARYING_SLOT_PNTC) {
585 sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
586 continue;
587 }
588
589 const int slot = fs_input_map->varying_to_slot[attr];
590
591 if (slot == -1) {
592 /* This attribute does not exist in the VUE--that means that
593 * the vertex shader did not write to it. It could be that it's
594 * a regular varying read by the fragment shader but not
595 * written by the vertex shader or it's gl_PrimitiveID. In the
596 * first case the value is undefined, in the second it needs to
597 * be gl_PrimitiveID.
598 */
599 swiz.Attribute[input_index].ConstantSource = PRIM_ID;
600 swiz.Attribute[input_index].ComponentOverrideX = true;
601 swiz.Attribute[input_index].ComponentOverrideY = true;
602 swiz.Attribute[input_index].ComponentOverrideZ = true;
603 swiz.Attribute[input_index].ComponentOverrideW = true;
604 continue;
605 }
606
607 /* We have to subtract two slots to account for the URB entry
608 * output read offset in the VS and GS stages.
609 */
610 const int source_attr = slot - 2 * urb_entry_read_offset;
611 assert(source_attr >= 0 && source_attr < 32);
612 max_source_attr = MAX2(max_source_attr, source_attr);
613 /* The hardware can only apply these swizzle/override entries to the
614 * first 16 attributes; any attribute beyond that must already have its
615 * source attribute equal to its input index. We'll need to do some
616 * tweaking to make sure that's the case.
617 */
618 if (input_index < 16)
619 swiz.Attribute[input_index].SourceAttribute = source_attr;
620 else
621 assert(source_attr == input_index);
622 }
623
624 sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
625 sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
626 sbe.ForceVertexURBEntryReadOffset = true;
627 sbe.ForceVertexURBEntryReadLength = true;
628
629 /* Ask the hardware to supply PrimitiveID if the fragment shader
630 * reads it but a previous stage didn't write one.
631 */
632 if (sbe_primitive_id_override(pipeline)) {
633 sbe.PrimitiveIDOverrideAttributeSelect =
634 wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
635 sbe.PrimitiveIDOverrideComponentX = true;
636 sbe.PrimitiveIDOverrideComponentY = true;
637 sbe.PrimitiveIDOverrideComponentZ = true;
638 sbe.PrimitiveIDOverrideComponentW = true;
639 }
640 } else {
641 assert(anv_pipeline_is_mesh(pipeline));
642 #if GFX_VERx10 >= 125
643 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
644 anv_pipeline_emit(pipeline, final.sbe_mesh,
645 GENX(3DSTATE_SBE_MESH), sbe_mesh) {
646 const struct brw_mue_map *mue = &mesh_prog_data->map;
647
648 assert(mue->per_vertex_header_size_dw % 8 == 0);
649 sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
650 sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
651
652 /* Clip distance array is passed in the per-vertex header so that
653 * it can be consumed by the HW. If user wants to read it in the
654 * FS, adjust the offset and length to cover it. Conveniently it
655 * is at the end of the per-vertex header, right before per-vertex
656 * attributes.
657 *
658 * Note that FS attribute reading must be aware that the clip
659 * distances have fixed position.
660 */
661 if (mue->per_vertex_header_size_dw > 8 &&
662 (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
663 wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
664 sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
665 sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
666 }
667
668 if (mue->user_data_in_vertex_header) {
669 sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
670 sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
671 }
672
673 assert(mue->per_primitive_header_size_dw % 8 == 0);
674 sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
675 mue->per_primitive_header_size_dw / 8;
676 sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
677 DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
678
679 /* Just like with clip distances, if Primitive Shading Rate,
680 * Viewport Index or Layer is read back in the FS, adjust the
681 * offset and length to cover the Primitive Header, where PSR,
682 * Viewport Index & Layer are stored.
683 */
684 if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
685 wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
686 wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
687 mue->user_data_in_primitive_header) {
688 assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
689 sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
690 sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
691 }
692 }
693 #endif
694 }
695 }
696 }
697 }
698
699 /** Returns the final polygon mode for rasterization
700 *
701 * This function takes into account polygon mode, primitive topology and the
702 * different shader stages which might generate their own type of primitives.
703 */
704 VkPolygonMode
705 genX(raster_polygon_mode)(const struct anv_graphics_pipeline *pipeline,
706 VkPolygonMode polygon_mode,
707 VkPrimitiveTopology primitive_topology)
708 {
709 if (anv_pipeline_is_mesh(pipeline)) {
710 switch (get_mesh_prog_data(pipeline)->primitive_type) {
711 case MESA_PRIM_POINTS:
712 return VK_POLYGON_MODE_POINT;
713 case MESA_PRIM_LINES:
714 return VK_POLYGON_MODE_LINE;
715 case MESA_PRIM_TRIANGLES:
716 return polygon_mode;
717 default:
718 unreachable("invalid primitive type for mesh");
719 }
720 } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
721 switch (get_gs_prog_data(pipeline)->output_topology) {
722 case _3DPRIM_POINTLIST:
723 return VK_POLYGON_MODE_POINT;
724
725 case _3DPRIM_LINELIST:
726 case _3DPRIM_LINESTRIP:
727 case _3DPRIM_LINELOOP:
728 return VK_POLYGON_MODE_LINE;
729
730 case _3DPRIM_TRILIST:
731 case _3DPRIM_TRIFAN:
732 case _3DPRIM_TRISTRIP:
733 case _3DPRIM_RECTLIST:
734 case _3DPRIM_QUADLIST:
735 case _3DPRIM_QUADSTRIP:
736 case _3DPRIM_POLYGON:
737 return polygon_mode;
738 }
739 unreachable("Unsupported GS output topology");
740 } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
741 switch (get_tes_prog_data(pipeline)->output_topology) {
742 case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
743 return VK_POLYGON_MODE_POINT;
744
745 case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
746 return VK_POLYGON_MODE_LINE;
747
748 case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
749 case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
750 return polygon_mode;
751 }
752 unreachable("Unsupported TCS output topology");
753 } else {
754 switch (primitive_topology) {
755 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
756 return VK_POLYGON_MODE_POINT;
757
758 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
759 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
760 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
761 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
762 return VK_POLYGON_MODE_LINE;
763
764 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
765 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
766 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
767 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
768 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
769 return polygon_mode;
770
771 default:
772 unreachable("Unsupported primitive topology");
773 }
774 }
775 }
776
777 const uint32_t genX(vk_to_intel_cullmode)[] = {
778 [VK_CULL_MODE_NONE] = CULLMODE_NONE,
779 [VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT,
780 [VK_CULL_MODE_BACK_BIT] = CULLMODE_BACK,
781 [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH
782 };
783
784 const uint32_t genX(vk_to_intel_fillmode)[] = {
785 [VK_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
786 [VK_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
787 [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT,
788 };
789
790 const uint32_t genX(vk_to_intel_front_face)[] = {
791 [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1,
792 [VK_FRONT_FACE_CLOCKWISE] = 0
793 };
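/* These values feed the hardware FrontWinding fields, which are assumed
 * here to encode 0 = clockwise and 1 = counter-clockwise front faces.
 */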
794
795 static void
796 emit_rs_state(struct anv_graphics_pipeline *pipeline,
797 const struct vk_input_assembly_state *ia,
798 const struct vk_rasterization_state *rs,
799 const struct vk_multisample_state *ms,
800 const struct vk_render_pass_state *rp,
801 enum intel_urb_deref_block_size urb_deref_block_size)
802 {
803 anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
804 sf.ViewportTransformEnable = true;
805 sf.StatisticsEnable = true;
806 sf.VertexSubPixelPrecisionSelect = _8Bit;
807 sf.AALineDistanceMode = true;
808
809 #if GFX_VER >= 12
810 sf.DerefBlockSize = urb_deref_block_size;
811 #endif
812
813 bool point_from_shader;
814 if (anv_pipeline_is_primitive(pipeline)) {
815 const struct brw_vue_prog_data *last_vue_prog_data =
816 anv_pipeline_get_last_vue_prog_data(pipeline);
817 point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
818 } else {
819 assert(anv_pipeline_is_mesh(pipeline));
820 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
821 point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
822 }
823
824 if (point_from_shader) {
825 sf.PointWidthSource = Vertex;
826 } else {
827 sf.PointWidthSource = State;
828 sf.PointWidth = 1.0;
829 }
830 }
831
832 anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
833 /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
834 * "Multisample Modes State".
835 */
836 /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
837 * computations. If we ever set this bit to a different value, they will
838 * need to be updated accordingly.
839 */
840 raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
841 raster.ForceMultisampling = false;
842
843 raster.ScissorRectangleEnable = true;
844 }
845 }
846
847 static void
848 emit_ms_state(struct anv_graphics_pipeline *pipeline,
849 const struct vk_multisample_state *ms)
850 {
851 anv_pipeline_emit(pipeline, final.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
852 ms.NumberofMultisamples = __builtin_ffs(pipeline->rasterization_samples) - 1;
853
854 ms.PixelLocation = CENTER;
855
856 /* The PRM says that this bit is valid only for DX9:
857 *
858 * SW can choose to set this bit only for DX9 API. DX10/OGL API's
859 * should not have any effect by setting or not setting this bit.
860 */
861 ms.PixelPositionOffsetEnable = false;
862 }
863 }
864
865 const uint32_t genX(vk_to_intel_logic_op)[] = {
866 [VK_LOGIC_OP_COPY] = LOGICOP_COPY,
867 [VK_LOGIC_OP_CLEAR] = LOGICOP_CLEAR,
868 [VK_LOGIC_OP_AND] = LOGICOP_AND,
869 [VK_LOGIC_OP_AND_REVERSE] = LOGICOP_AND_REVERSE,
870 [VK_LOGIC_OP_AND_INVERTED] = LOGICOP_AND_INVERTED,
871 [VK_LOGIC_OP_NO_OP] = LOGICOP_NOOP,
872 [VK_LOGIC_OP_XOR] = LOGICOP_XOR,
873 [VK_LOGIC_OP_OR] = LOGICOP_OR,
874 [VK_LOGIC_OP_NOR] = LOGICOP_NOR,
875 [VK_LOGIC_OP_EQUIVALENT] = LOGICOP_EQUIV,
876 [VK_LOGIC_OP_INVERT] = LOGICOP_INVERT,
877 [VK_LOGIC_OP_OR_REVERSE] = LOGICOP_OR_REVERSE,
878 [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED,
879 [VK_LOGIC_OP_OR_INVERTED] = LOGICOP_OR_INVERTED,
880 [VK_LOGIC_OP_NAND] = LOGICOP_NAND,
881 [VK_LOGIC_OP_SET] = LOGICOP_SET,
882 };
883
884 const uint32_t genX(vk_to_intel_compare_op)[] = {
885 [VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER,
886 [VK_COMPARE_OP_LESS] = PREFILTEROP_LESS,
887 [VK_COMPARE_OP_EQUAL] = PREFILTEROP_EQUAL,
888 [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LEQUAL,
889 [VK_COMPARE_OP_GREATER] = PREFILTEROP_GREATER,
890 [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_NOTEQUAL,
891 [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL,
892 [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_ALWAYS,
893 };
894
895 const uint32_t genX(vk_to_intel_stencil_op)[] = {
896 [VK_STENCIL_OP_KEEP] = STENCILOP_KEEP,
897 [VK_STENCIL_OP_ZERO] = STENCILOP_ZERO,
898 [VK_STENCIL_OP_REPLACE] = STENCILOP_REPLACE,
899 [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT,
900 [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT,
901 [VK_STENCIL_OP_INVERT] = STENCILOP_INVERT,
902 [VK_STENCIL_OP_INCREMENT_AND_WRAP] = STENCILOP_INCR,
903 [VK_STENCIL_OP_DECREMENT_AND_WRAP] = STENCILOP_DECR,
904 };
905
906 const uint32_t genX(vk_to_intel_primitive_type)[] = {
907 [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST,
908 [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST,
909 [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP,
910 [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST,
911 [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
912 [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
913 [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
914 [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
915 [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
916 [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
917 };
918
919 static void
920 emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
921 const struct vk_input_assembly_state *ia,
922 const struct vk_viewport_state *vp,
923 const struct vk_rasterization_state *rs)
924 {
925 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
926 (void) wm_prog_data;
927
928 anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
929 clip.ClipEnable = true;
930 clip.StatisticsEnable = true;
931 clip.EarlyCullEnable = true;
932 clip.GuardbandClipTestEnable = true;
933
934 clip.VertexSubPixelPrecisionSelect = _8Bit;
935 clip.ClipMode = CLIPMODE_NORMAL;
936
937 clip.MinimumPointWidth = 0.125;
938 clip.MaximumPointWidth = 255.875;
939
940 /* TODO(mesh): Multiview. */
941 if (anv_pipeline_is_primitive(pipeline)) {
942 const struct brw_vue_prog_data *last =
943 anv_pipeline_get_last_vue_prog_data(pipeline);
944
945 /* From the Vulkan 1.0.45 spec:
946 *
947 * "If the last active vertex processing stage shader entry
948 * point's interface does not include a variable decorated with
949 * ViewportIndex, then the first viewport is used."
950 */
951 if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
952 clip.MaximumVPIndex = vp->viewport_count > 0 ?
953 vp->viewport_count - 1 : 0;
954 } else {
955 clip.MaximumVPIndex = 0;
956 }
957
958 /* From the Vulkan 1.0.45 spec:
959 *
960 * "If the last active vertex processing stage shader entry point's
961 * interface does not include a variable decorated with Layer, then
962 * the first layer is used."
963 */
964 clip.ForceZeroRTAIndexEnable =
965 !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
966
967 } else if (anv_pipeline_is_mesh(pipeline)) {
968 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
969 if (vp && vp->viewport_count > 0 &&
970 mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
971 clip.MaximumVPIndex = vp->viewport_count - 1;
972 } else {
973 clip.MaximumVPIndex = 0;
974 }
975
976 clip.ForceZeroRTAIndexEnable =
977 mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
978 }
979
980 clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
981 wm_prog_data->uses_nonperspective_interp_modes : 0;
982 }
983
984 #if GFX_VERx10 >= 125
985 if (anv_pipeline_is_mesh(pipeline)) {
986 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
987 anv_pipeline_emit(pipeline, final.clip_mesh,
988 GENX(3DSTATE_CLIP_MESH), clip_mesh) {
989 clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
990 clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
991 clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
992 }
993 }
994 #endif
995 }
996
997 static void
998 emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
999 const struct vk_rasterization_state *rs)
1000 {
1001 const struct brw_vue_prog_data *prog_data =
1002 anv_pipeline_get_last_vue_prog_data(pipeline);
1003 const struct intel_vue_map *vue_map = &prog_data->vue_map;
1004
1005 nir_xfb_info *xfb_info;
1006 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1007 xfb_info = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->xfb_info;
1008 else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
1009 xfb_info = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
1010 else
1011 xfb_info = pipeline->base.shaders[MESA_SHADER_VERTEX]->xfb_info;
1012
1013 if (xfb_info) {
1014 struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
1015 int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
1016 int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
1017
1018 memset(so_decl, 0, sizeof(so_decl));
1019
1020 for (unsigned i = 0; i < xfb_info->output_count; i++) {
1021 const nir_xfb_output_info *output = &xfb_info->outputs[i];
1022 unsigned buffer = output->buffer;
1023 unsigned stream = xfb_info->buffer_to_stream[buffer];
1024
1025 /* Our hardware is unusual in that it requires us to program SO_DECLs
1026 * for fake "hole" components, rather than simply taking the offset
1027 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
1028 * program as many size = 4 holes as we can, then a final hole to
1029 * accommodate the final 1, 2, or 3 remaining.
1030 */
1031 int hole_dwords = (output->offset - next_offset[buffer]) / 4;
1032 while (hole_dwords > 0) {
1033 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1034 .HoleFlag = 1,
1035 .OutputBufferSlot = buffer,
1036 .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
1037 };
1038 hole_dwords -= 4;
1039 }
1040
1041 int varying = output->location;
1042 uint8_t component_mask = output->component_mask;
1043 /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
1044 * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
1045 * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y
1046 * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
1047 * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w
1048 */
1049 if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
1050 varying = VARYING_SLOT_PSIZ;
1051 component_mask = 1 << 0; // SO_DECL_COMPMASK_X
1052 } else if (varying == VARYING_SLOT_LAYER) {
1053 varying = VARYING_SLOT_PSIZ;
1054 component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
1055 } else if (varying == VARYING_SLOT_VIEWPORT) {
1056 varying = VARYING_SLOT_PSIZ;
1057 component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
1058 } else if (varying == VARYING_SLOT_PSIZ) {
1059 component_mask = 1 << 3; // SO_DECL_COMPMASK_W
1060 }
1061
1062 next_offset[buffer] = output->offset +
1063 __builtin_popcount(component_mask) * 4;
1064
1065 const int slot = vue_map->varying_to_slot[varying];
1066 if (slot < 0) {
1067 /* This can happen if the shader never writes to the varying.
1068 * Insert a hole instead of actual varying data.
1069 */
1070 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1071 .HoleFlag = true,
1072 .OutputBufferSlot = buffer,
1073 .ComponentMask = component_mask,
1074 };
1075 } else {
1076 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1077 .OutputBufferSlot = buffer,
1078 .RegisterIndex = slot,
1079 .ComponentMask = component_mask,
1080 };
1081 }
1082 }
1083
1084 int max_decls = 0;
1085 for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
1086 max_decls = MAX2(max_decls, decls[s]);
1087
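      /* For each stream, build the mask of transform feedback buffers it
       * writes; these feed the StreamtoBufferSelects fields below.
       */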
1088 uint8_t sbs[MAX_XFB_STREAMS] = { };
1089 for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
1090 if (xfb_info->buffers_written & (1 << b))
1091 sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
1092 }
1093
1094 uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
1095 3 + 2 * max_decls,
1096 GENX(3DSTATE_SO_DECL_LIST),
1097 .StreamtoBufferSelects0 = sbs[0],
1098 .StreamtoBufferSelects1 = sbs[1],
1099 .StreamtoBufferSelects2 = sbs[2],
1100 .StreamtoBufferSelects3 = sbs[3],
1101 .NumEntries0 = decls[0],
1102 .NumEntries1 = decls[1],
1103 .NumEntries2 = decls[2],
1104 .NumEntries3 = decls[3]);
1105
1106 for (int i = 0; i < max_decls; i++) {
1107 GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
1108 &(struct GENX(SO_DECL_ENTRY)) {
1109 .Stream0Decl = so_decl[0][i],
1110 .Stream1Decl = so_decl[1][i],
1111 .Stream2Decl = so_decl[2][i],
1112 .Stream3Decl = so_decl[3][i],
1113 });
1114 }
1115 }
1116
1117 anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
1118 if (xfb_info) {
1119 pipeline->uses_xfb = true;
1120
1121 so.SOFunctionEnable = true;
1122 so.SOStatisticsEnable = true;
1123
1124 so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
1125 so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
1126 so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
1127 so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
1128
1129 int urb_entry_read_offset = 0;
1130 int urb_entry_read_length =
1131 (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
1132
1133 /* We always read the whole vertex. This could be reduced at some
1134 * point by reading less and offsetting the register index in the
1135 * SO_DECLs.
1136 */
1137 so.Stream0VertexReadOffset = urb_entry_read_offset;
1138 so.Stream0VertexReadLength = urb_entry_read_length - 1;
1139 so.Stream1VertexReadOffset = urb_entry_read_offset;
1140 so.Stream1VertexReadLength = urb_entry_read_length - 1;
1141 so.Stream2VertexReadOffset = urb_entry_read_offset;
1142 so.Stream2VertexReadLength = urb_entry_read_length - 1;
1143 so.Stream3VertexReadOffset = urb_entry_read_offset;
1144 so.Stream3VertexReadLength = urb_entry_read_length - 1;
1145 }
1146 }
1147 }
1148
1149 static uint32_t
1150 get_sampler_count(const struct anv_shader_bin *bin)
1151 {
1152 uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);
1153
1154 /* We can potentially have way more than 32 samplers and that's ok.
1155 * However, the 3DSTATE_XS packets only have 3 bits to specify how
1156 * many to pre-fetch and all values above 4 are marked reserved.
1157 */
1158 return MIN2(count_by_4, 4);
1159 }
1160
1161 static UNUSED struct anv_address
1162 get_scratch_address(struct anv_pipeline *pipeline,
1163 gl_shader_stage stage,
1164 const struct anv_shader_bin *bin)
1165 {
1166 return (struct anv_address) {
1167 .bo = anv_scratch_pool_alloc(pipeline->device,
1168 &pipeline->device->scratch_pool,
1169 stage, bin->prog_data->total_scratch),
1170 .offset = 0,
1171 };
1172 }
1173
1174 static UNUSED uint32_t
1175 get_scratch_space(const struct anv_shader_bin *bin)
1176 {
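   /* PerThreadScratchSpace uses a power-of-two encoding, assumed here to be
    * 0 = 1KB per thread, 1 = 2KB, 2 = 4KB, and so on. For the power-of-two
    * sizes handed out by the scratch pool, ffs(total_scratch / 2048) yields
    * log2(total_scratch / 1KB).
    */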
1177 return ffs(bin->prog_data->total_scratch / 2048);
1178 }
1179
1180 static UNUSED uint32_t
1181 get_scratch_surf(struct anv_pipeline *pipeline,
1182 gl_shader_stage stage,
1183 const struct anv_shader_bin *bin)
1184 {
1185 if (bin->prog_data->total_scratch == 0)
1186 return 0;
1187
1188 struct anv_bo *bo =
1189 anv_scratch_pool_alloc(pipeline->device,
1190 &pipeline->device->scratch_pool,
1191 stage, bin->prog_data->total_scratch);
1192 anv_reloc_list_add_bo(pipeline->batch.relocs, bo);
1193 return anv_scratch_pool_get_surf(pipeline->device,
1194 &pipeline->device->scratch_pool,
1195 bin->prog_data->total_scratch) >> 4;
1196 }
1197
1198 static void
1199 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
1200 {
1201 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1202 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1203 const struct anv_shader_bin *vs_bin =
1204 pipeline->base.shaders[MESA_SHADER_VERTEX];
1205
1206 assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
1207
1208 anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
1209 vs.Enable = true;
1210 vs.StatisticsEnable = true;
1211 vs.KernelStartPointer = vs_bin->kernel.offset;
1212 #if GFX_VER < 20
1213 vs.SIMD8DispatchEnable =
1214 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
1215 #endif
1216
1217 assert(!vs_prog_data->base.base.use_alt_mode);
1218 #if GFX_VER < 11
1219 vs.SingleVertexDispatch = false;
1220 #endif
1221 vs.VectorMaskEnable = false;
1222 /* Wa_1606682166:
1223 * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
1224 * Disable the Sampler state prefetch functionality in the SARB by
1225 * programming 0xB000[30] to '1'.
1226 */
1227 vs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
1228 vs.BindingTableEntryCount = vs_bin->bind_map.surface_count;
1229 vs.FloatingPointMode = IEEE754;
1230 vs.IllegalOpcodeExceptionEnable = false;
1231 vs.SoftwareExceptionEnable = false;
1232 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
1233
1234 if (GFX_VER == 9 && devinfo->gt == 4 &&
1235 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1236 /* On Sky Lake GT4, we have experienced some hangs related to the VS
1237 * cache and tessellation. It is unknown exactly what is happening
1238 * but the Haswell docs for the "VS Reference Count Full Force Miss
1239 * Enable" field of the "Thread Mode" register refer to a HSW bug in
1240 * which the VUE handle reference count would overflow resulting in
1241 * internal reference counting bugs. My (Faith's) best guess is that
1242 * this bug cropped back up on SKL GT4 when we suddenly had more
1243 * threads in play than any previous gfx9 hardware.
1244 *
1245 * What we do know for sure is that setting this bit when
1246 * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
1247 * City when playing with DXVK (https://bugs.freedesktop.org/107280).
1248 * Disabling the vertex cache with tessellation shaders should only
1249 * have a minor performance impact as the tessellation shaders are
1250 * likely generating and processing far more geometry than the vertex
1251 * stage.
1252 */
1253 vs.VertexCacheDisable = true;
1254 }
1255
1256 vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
1257 vs.VertexURBEntryReadOffset = 0;
1258 vs.DispatchGRFStartRegisterForURBData =
1259 vs_prog_data->base.base.dispatch_grf_start_reg;
1260
1261 vs.UserClipDistanceClipTestEnableBitmask =
1262 vs_prog_data->base.clip_distance_mask;
1263 vs.UserClipDistanceCullTestEnableBitmask =
1264 vs_prog_data->base.cull_distance_mask;
1265
1266 #if GFX_VERx10 >= 125
1267 vs.ScratchSpaceBuffer =
1268 get_scratch_surf(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
1269 #else
1270 vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
1271 vs.ScratchSpaceBasePointer =
1272 get_scratch_address(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
1273 #endif
1274 }
1275 }
1276
1277 static void
1278 emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
1279 const struct vk_tessellation_state *ts)
1280 {
1281 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1282 anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
1283 anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
1284 return;
1285 }
1286
1287 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1288 const struct anv_shader_bin *tcs_bin =
1289 pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
1290 const struct anv_shader_bin *tes_bin =
1291 pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
1292
1293 const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1294 const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1295
1296 anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
1297 hs.Enable = true;
1298 hs.StatisticsEnable = true;
1299 hs.KernelStartPointer = tcs_bin->kernel.offset;
1300 /* Wa_1606682166 */
1301 hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
1302 hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
1303
1304 #if GFX_VER >= 12
1305 /* Wa_1604578095:
1306 *
1307 * A hang occurs when the maximum number of threads is less than
1308 * twice the instance count; the maximum number of threads must be
1309 * more than twice the instance count.
1310 */
1311 assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
1312 #endif
1313
1314 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1315 hs.IncludeVertexHandles = true;
1316 hs.InstanceCount = tcs_prog_data->instances - 1;
1317
1318 hs.VertexURBEntryReadLength = 0;
1319 hs.VertexURBEntryReadOffset = 0;
1320 hs.DispatchGRFStartRegisterForURBData =
1321 tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
1322 #if GFX_VER >= 12
1323 hs.DispatchGRFStartRegisterForURBData5 =
1324 tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
1325 #endif
1326
1327 #if GFX_VERx10 >= 125
1328 hs.ScratchSpaceBuffer =
1329 get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
1330 #else
1331 hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1332 hs.ScratchSpaceBasePointer =
1333 get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
1334 #endif
1335
1336 #if GFX_VER == 12
1337 /* Patch Count threshold specifies the maximum number of patches that
1338 * will be accumulated before a thread dispatch is forced.
1339 */
1340 hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
1341 #endif
1342
1343 #if GFX_VER < 20
1344 hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
1345 #endif
1346 hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
1347 };
1348
1349 anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
1350 ds.Enable = true;
1351 ds.StatisticsEnable = true;
1352 ds.KernelStartPointer = tes_bin->kernel.offset;
1353 /* Wa_1606682166 */
1354 ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
1355 ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
1356 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1357
1358 ds.ComputeWCoordinateEnable =
1359 tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
1360
1361 ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1362 ds.PatchURBEntryReadOffset = 0;
1363 ds.DispatchGRFStartRegisterForURBData =
1364 tes_prog_data->base.base.dispatch_grf_start_reg;
1365
1366 #if GFX_VER < 11
1367 ds.DispatchMode =
1368 tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
1369 DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1370 DISPATCH_MODE_SIMD4X2;
1371 #else
1372 assert(tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
1373 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
1374 #endif
1375
1376 ds.UserClipDistanceClipTestEnableBitmask =
1377 tes_prog_data->base.clip_distance_mask;
1378 ds.UserClipDistanceCullTestEnableBitmask =
1379 tes_prog_data->base.cull_distance_mask;
1380
1381 #if GFX_VER >= 12
1382 ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
1383 #endif
1384 #if GFX_VERx10 >= 125
1385 ds.ScratchSpaceBuffer =
1386 get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
1387 #else
1388 ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1389 ds.ScratchSpaceBasePointer =
1390 get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
1391 #endif
1392 }
1393 }
1394
1395 static UNUSED bool
1396 geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
1397 {
1398 const struct brw_tcs_prog_data *tcs_prog_data =
1399 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ?
1400 get_tcs_prog_data(pipeline) : NULL;
1401 const struct brw_tes_prog_data *tes_prog_data =
1402 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
1403 get_tes_prog_data(pipeline) : NULL;
1404 const struct brw_gs_prog_data *gs_prog_data =
1405 anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ?
1406 get_gs_prog_data(pipeline) : NULL;
1407
1408 return (tcs_prog_data && tcs_prog_data->include_primitive_id) ||
1409 (tes_prog_data && tes_prog_data->include_primitive_id) ||
1410 (gs_prog_data && gs_prog_data->include_primitive_id);
1411 }
1412
1413 static void
1414 emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
1415 {
1416 anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
1417 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1418 const struct brw_tes_prog_data *tes_prog_data =
1419 get_tes_prog_data(pipeline);
1420
1421 te.Partitioning = tes_prog_data->partitioning;
1422 te.TEDomain = tes_prog_data->domain;
1423 te.TEEnable = true;
1424 te.MaximumTessellationFactorOdd = 63.0;
1425 te.MaximumTessellationFactorNotOdd = 64.0;
1426 #if GFX_VERx10 >= 125
1427 const struct anv_device *device = pipeline->base.base.device;
1428 if (intel_needs_workaround(device->info, 22012699309))
1429 te.TessellationDistributionMode = TEDMODE_RR_STRICT;
1430 else
1431 te.TessellationDistributionMode = TEDMODE_RR_FREE;
1432
1433 if (intel_needs_workaround(device->info, 14015055625)) {
1434 /* Wa_14015055625:
1435 *
1436 * Disable Tessellation Distribution when primitive Id is enabled.
1437 */
1438 if (sbe_primitive_id_override(pipeline) ||
1439 geom_or_tess_prim_id_used(pipeline))
1440 te.TessellationDistributionMode = TEDMODE_OFF;
1441 }
1442
1443 #if GFX_VER >= 20
1444 te.TessellationDistributionLevel = TEDLEVEL_REGION;
1445 #else
1446 te.TessellationDistributionLevel = TEDLEVEL_PATCH;
1447 #endif
1448 /* 64_TRIANGLES */
1449 te.SmallPatchThreshold = 3;
1450 /* 1K_TRIANGLES */
1451 te.TargetBlockSize = 8;
1452 /* 1K_TRIANGLES */
1453 te.LocalBOPAccumulatorThreshold = 1;
1454 #endif
1455 }
1456 }
1457 }
1458
1459 static void
1460 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
1461 {
1462 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1463 anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
1464 return;
1465 }
1466
1467 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1468 const struct anv_shader_bin *gs_bin =
1469 pipeline->base.shaders[MESA_SHADER_GEOMETRY];
1470 const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
1471
1472 anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
1473 gs.Enable = true;
1474 gs.StatisticsEnable = true;
1475 gs.KernelStartPointer = gs_bin->kernel.offset;
1476 #if GFX_VER < 20
1477 gs.DispatchMode = gs_prog_data->base.dispatch_mode;
1478 #endif
1479
1480 gs.SingleProgramFlow = false;
1481 gs.VectorMaskEnable = false;
1482 /* Wa_1606682166 */
1483 gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
1484 gs.BindingTableEntryCount = gs_bin->bind_map.surface_count;
1485 gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
1486 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
1487
1488 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
1489
1490 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
1491 gs.OutputTopology = gs_prog_data->output_topology;
1492 gs.ControlDataFormat = gs_prog_data->control_data_format;
1493 gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
1494 gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;
1495
1496 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
1497 gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;
1498 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
1499 gs_prog_data->static_vertex_count : 0;
1500
1501 gs.VertexURBEntryReadOffset = 0;
1502 gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
1503 gs.DispatchGRFStartRegisterForURBData =
1504 gs_prog_data->base.base.dispatch_grf_start_reg;
1505
1506 gs.UserClipDistanceClipTestEnableBitmask =
1507 gs_prog_data->base.clip_distance_mask;
1508 gs.UserClipDistanceCullTestEnableBitmask =
1509 gs_prog_data->base.cull_distance_mask;
1510
1511 #if GFX_VERx10 >= 125
1512 gs.ScratchSpaceBuffer =
1513 get_scratch_surf(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
1514 #else
1515 gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
1516 gs.ScratchSpaceBasePointer =
1517 get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
1518 #endif
1519 }
1520 }
1521
1522 static void
1523 emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
1524 const struct vk_input_assembly_state *ia,
1525 const struct vk_rasterization_state *rs,
1526 const struct vk_multisample_state *ms,
1527 const struct vk_color_blend_state *cb,
1528 const struct vk_render_pass_state *rp)
1529 {
1530 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1531
1532 anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
1533 wm.StatisticsEnable = true;
1534 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1535 wm.LineAntialiasingRegionWidth = _10pixels;
1536 wm.PointRasterizationRule = RASTRULE_UPPER_LEFT;
1537
1538 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1539 if (wm_prog_data->early_fragment_tests) {
1540 wm.EarlyDepthStencilControl = EDSC_PREPS;
1541 } else if (wm_prog_data->has_side_effects) {
1542 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
1543 } else {
1544 wm.EarlyDepthStencilControl = EDSC_NORMAL;
1545 }
1546
1547 /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
1548 * doesn't take into account KillPixels when no depth or stencil
1549 * writes are enabled. In order for occlusion queries to work
1550 * correctly with no attachments, we need to force-enable PS thread
1551 * dispatch.
1552 *
1553 * The BDW docs are pretty clear that this bit isn't validated
1554 * and probably shouldn't be used in production:
1555 *
1556 * "This must always be set to Normal. This field should not be
1557 * tested for functional validation."
1558 *
1559 * Unfortunately, however, the other mechanism we have for doing this
1560 * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
1561 * Given two bad options, we choose the one which works.
1562 */
1563 pipeline->force_fragment_thread_dispatch =
1564 wm_prog_data->has_side_effects ||
1565 wm_prog_data->uses_kill;
1566
1567 wm.BarycentricInterpolationMode =
1568 wm_prog_data_barycentric_modes(wm_prog_data,
1569 pipeline->fs_msaa_flags);
1570 }
1571 }
1572 }
1573
1574 static void
1575 emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
1576 const struct vk_multisample_state *ms,
1577 const struct vk_color_blend_state *cb)
1578 {
1579 UNUSED const struct intel_device_info *devinfo =
1580 pipeline->base.base.device->info;
1581 const struct anv_shader_bin *fs_bin =
1582 pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1583
1584 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1585 anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps);
1586 return;
1587 }
1588
1589 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1590
1591 anv_pipeline_emit(pipeline, final.ps, GENX(3DSTATE_PS), ps) {
1592 intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
1593 ms != NULL ? ms->rasterization_samples : 1,
1594 pipeline->fs_msaa_flags);
1595
1596 const bool persample =
1597 brw_wm_prog_data_is_persample(wm_prog_data, pipeline->fs_msaa_flags);
1598
1599 #if GFX_VER == 12
1600 assert(wm_prog_data->dispatch_multi == 0 ||
1601 (wm_prog_data->dispatch_multi == 16 && wm_prog_data->max_polygons == 2));
1602 ps.DualSIMD8DispatchEnable = wm_prog_data->dispatch_multi;
1603 /* XXX - No major improvement observed from enabling
1604 * overlapping subspans, but it could be helpful
1605 * in theory when the requirements listed on the
1606 * BSpec page for 3DSTATE_PS_BODY are met.
1607 */
1608 ps.OverlappingSubspansEnable = false;
1609 #endif
1610
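/* One kernel start pointer per dispatch-width slot. Which SIMD8/16/32
 * kernels are enabled was decided above by intel_set_ps_dispatch_state();
 * brw_wm_prog_data_prog_offset() returns the code offset for each slot.
 */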
1611 ps.KernelStartPointer0 = fs_bin->kernel.offset +
1612 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
1613 ps.KernelStartPointer1 = fs_bin->kernel.offset +
1614 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
1615 #if GFX_VER < 20
1616 ps.KernelStartPointer2 = fs_bin->kernel.offset +
1617 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
1618 #endif
1619
1620 ps.SingleProgramFlow = false;
1621 ps.VectorMaskEnable = wm_prog_data->uses_vmask;
1622 /* Wa_1606682166 */
1623 ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
1624 ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
1625 #if GFX_VER < 20
1626 ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
1627 wm_prog_data->base.ubo_ranges[0].length;
1628 #endif
1629 ps.PositionXYOffsetSelect =
1630 !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
1631 persample ? POSOFFSET_SAMPLE : POSOFFSET_CENTROID;
1632
1633 ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
1634
1635 ps.DispatchGRFStartRegisterForConstantSetupData0 =
1636 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
1637 ps.DispatchGRFStartRegisterForConstantSetupData1 =
1638 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
1639 #if GFX_VER < 20
1640 ps.DispatchGRFStartRegisterForConstantSetupData2 =
1641 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
1642 #endif
1643
1644 #if GFX_VERx10 >= 125
1645 ps.ScratchSpaceBuffer =
1646 get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
1647 #else
1648 ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
1649 ps.ScratchSpaceBasePointer =
1650 get_scratch_address(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
1651 #endif
1652 }
1653 }
1654
1655 static void
1656 emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
1657 const struct vk_rasterization_state *rs,
1658 const struct vk_graphics_pipeline_state *state)
1659 {
1660 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1661
1662 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1663 anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
1664 return;
1665 }
1666
1667 anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
1668 ps.PixelShaderValid = true;
1669 #if GFX_VER < 20
1670 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
1671 #endif
1672 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1673 ps.PixelShaderIsPerSample =
1674 brw_wm_prog_data_is_persample(wm_prog_data, pipeline->fs_msaa_flags);
1675 ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
1676 ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
1677 ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1678
1679 ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
1680 #if GFX_VER >= 20
1681 assert(!wm_prog_data->pulls_bary);
1682 #else
1683 ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;
1684 #endif
1685
1686 ps.InputCoverageMaskState = ICMS_NONE;
1687 assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
1688 if (!wm_prog_data->uses_sample_mask)
1689 ps.InputCoverageMaskState = ICMS_NONE;
1690 else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
1691 ps.InputCoverageMaskState = ICMS_NORMAL;
1692 else if (wm_prog_data->post_depth_coverage)
1693 ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
1694 else
1695 ps.InputCoverageMaskState = ICMS_NORMAL;
1696
1697 #if GFX_VER >= 11
1698 ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
1699 wm_prog_data->uses_depth_w_coefficients;
1700 ps.PixelShaderIsPerCoarsePixel =
1701 brw_wm_prog_data_is_coarse(wm_prog_data, pipeline->fs_msaa_flags);
1702 #endif
1703 #if GFX_VERx10 >= 125
1704 /* TODO: We should only require this when the last geometry shader uses
1705 * a fragment shading rate that is not constant.
1706 */
1707 ps.EnablePSDependencyOnCPsizeChange =
1708 brw_wm_prog_data_is_coarse(wm_prog_data, pipeline->fs_msaa_flags);
1709 #endif
1710 }
1711 }
1712
1713 static void
1714 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
1715 {
1716 anv_pipeline_emit(pipeline, final.vf_statistics,
1717 GENX(3DSTATE_VF_STATISTICS), vfs) {
1718 vfs.StatisticsEnable = true;
1719 }
1720 }
1721
1722 static void
1723 compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
1724 const struct vk_multisample_state *ms,
1725 const struct vk_graphics_pipeline_state *state)
1726 {
1727 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1728 pipeline->kill_pixel = false;
1729 return;
1730 }
1731
1732 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1733
1734 /* This computes the KillPixel portion of the computation for whether or
1735 * not we want to enable the PMA fix on gfx8 or gfx9. It's given by this
1736 * chunk of the giant formula:
1737 *
1738 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1739 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1740 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1741 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1742 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1743 *
1744 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
1745 * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
1746 * of an alpha test.
1747 */
1748 pipeline->rp_has_ds_self_dep =
1749 (state->pipeline_flags &
1750 VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) != 0;
1751 pipeline->kill_pixel =
1752 pipeline->rp_has_ds_self_dep ||
1753 wm_prog_data->uses_kill ||
1754 wm_prog_data->uses_omask ||
1755 (ms && ms->alpha_to_coverage_enable);
1756 }
1757
1758 #if GFX_VER >= 12
1759 static void
1760 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
1761 const struct vk_render_pass_state *rp)
1762 {
1763 if (anv_pipeline_is_mesh(pipeline)) {
1764 anv_pipeline_emit(pipeline, final.primitive_replication,
1765 GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1766 return;
1767 }
1768
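/* Primitive replication implements multiview in a single geometry pass: the
 * last pre-rasterization stage writes one position slot per view, so the
 * number of position slots in its VUE map is the replication count.
 */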
1769 const int replication_count =
1770 anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;
1771
1772 assert(replication_count >= 1);
1773 if (replication_count == 1) {
1774 anv_pipeline_emit(pipeline, final.primitive_replication,
1775 GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1776 return;
1777 }
1778
1779 assert(replication_count == util_bitcount(rp->view_mask));
1780 assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
1781
1782 anv_pipeline_emit(pipeline, final.primitive_replication,
1783 GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
1784 pr.ReplicaMask = (1 << replication_count) - 1;
1785 pr.ReplicationCount = replication_count - 1;
1786
1787 int i = 0;
1788 u_foreach_bit(view_index, rp->view_mask) {
1789 pr.RTAIOffset[i] = view_index;
1790 i++;
1791 }
1792 }
1793 }
1794 #endif
1795
1796 #if GFX_VERx10 >= 125
1797 static void
1798 emit_task_state(struct anv_graphics_pipeline *pipeline)
1799 {
1800 assert(anv_pipeline_is_mesh(pipeline));
1801
1802 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
1803 anv_pipeline_emit(pipeline, final.task_control,
1804 GENX(3DSTATE_TASK_CONTROL), zero);
1805 anv_pipeline_emit(pipeline, final.task_shader,
1806 GENX(3DSTATE_TASK_SHADER), zero);
1807 anv_pipeline_emit(pipeline, final.task_redistrib,
1808 GENX(3DSTATE_TASK_REDISTRIB), zero);
1809 return;
1810 }
1811
1812 const struct anv_shader_bin *task_bin =
1813 pipeline->base.shaders[MESA_SHADER_TASK];
1814
1815 anv_pipeline_emit(pipeline, final.task_control,
1816 GENX(3DSTATE_TASK_CONTROL), tc) {
1817 tc.TaskShaderEnable = true;
1818 tc.ScratchSpaceBuffer =
1819 get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
1820 tc.MaximumNumberofThreadGroups = 511;
1821 }
1822
1823 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1824 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1825 const struct intel_cs_dispatch_info task_dispatch =
1826 brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
1827
1828 anv_pipeline_emit(pipeline, final.task_shader,
1829 GENX(3DSTATE_TASK_SHADER), task) {
1830 task.KernelStartPointer = task_bin->kernel.offset;
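/* The SIMDSize field encodes SIMD8/16/32 as 0/1/2, hence the division by 16. */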
1831 task.SIMDSize = task_dispatch.simd_size / 16;
1832 task.MessageSIMD = task.SIMDSize;
1833 task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads;
1834 task.ExecutionMask = task_dispatch.right_mask;
1835 task.LocalXMaximum = task_dispatch.group_size - 1;
1836 task.EmitLocalIDX = true;
1837
1838 task.NumberofBarriers = task_prog_data->base.uses_barrier;
1839 task.SharedLocalMemorySize =
1840 encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared);
1841 task.PreferredSLMAllocationSize =
1842 preferred_slm_allocation_size(devinfo);
1843
1844 /*
1845 * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for the address
1846 * of a buffer holding the push constants and the descriptor set table,
1847 * and InlineData[2:7] will be used for the first few push constants.
1848 */
1849 task.EmitInlineParameter = true;
1850
1851 task.XP0Required = task_prog_data->uses_drawid;
1852 }
1853
1854 /* Recommended values from "Task and Mesh Distribution Programming". */
1855 anv_pipeline_emit(pipeline, final.task_redistrib,
1856 GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
1857 redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
1858 redistrib.SmallTaskThreshold = 1; /* 2^N */
1859 redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
1860 redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM;
1861 redistrib.TaskRedistributionMode = TASKREDISTRIB_RR_STRICT;
1862 }
1863 }
1864
1865 static void
1866 emit_mesh_state(struct anv_graphics_pipeline *pipeline)
1867 {
1868 assert(anv_pipeline_is_mesh(pipeline));
1869
1870 const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
1871
1872 anv_pipeline_emit(pipeline, final.mesh_control,
1873 GENX(3DSTATE_MESH_CONTROL), mc) {
1874 mc.MeshShaderEnable = true;
1875 mc.ScratchSpaceBuffer =
1876 get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
1877 mc.MaximumNumberofThreadGroups = 511;
1878 }
1879
1880 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1881 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1882 const struct intel_cs_dispatch_info mesh_dispatch =
1883 brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL);
1884
1885 const unsigned output_topology =
1886 mesh_prog_data->primitive_type == MESA_PRIM_POINTS ? OUTPUT_POINT :
1887 mesh_prog_data->primitive_type == MESA_PRIM_LINES ? OUTPUT_LINE :
1888 OUTPUT_TRI;
1889
1890 uint32_t index_format;
1891 switch (mesh_prog_data->index_format) {
1892 case BRW_INDEX_FORMAT_U32:
1893 index_format = INDEX_U32;
1894 break;
1895 case BRW_INDEX_FORMAT_U888X:
1896 index_format = INDEX_U888X;
1897 break;
1898 default:
1899 unreachable("invalid index format");
1900 }
1901
1902 anv_pipeline_emit(pipeline, final.mesh_shader,
1903 GENX(3DSTATE_MESH_SHADER), mesh) {
1904 mesh.KernelStartPointer = mesh_bin->kernel.offset;
1905 mesh.SIMDSize = mesh_dispatch.simd_size / 16;
1906 mesh.MessageSIMD = mesh.SIMDSize;
1907 mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads;
1908 mesh.ExecutionMask = mesh_dispatch.right_mask;
1909 mesh.LocalXMaximum = mesh_dispatch.group_size - 1;
1910 mesh.EmitLocalIDX = true;
1911
1912 mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
1913 mesh.OutputTopology = output_topology;
1914 mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8;
1915 mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0;
1916 mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8;
1917 mesh.IndexFormat = index_format;
1918
1919 mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
1920 mesh.SharedLocalMemorySize =
1921 encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared);
1922 mesh.PreferredSLMAllocationSize =
1923 preferred_slm_allocation_size(devinfo);
1924
1925 /*
1926 * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for the address
1927 * of a buffer holding the push constants and the descriptor set table,
1928 * and InlineData[2:7] will be used for the first few push constants.
1929 */
1930 mesh.EmitInlineParameter = true;
1931
1932 mesh.XP0Required = mesh_prog_data->uses_drawid;
1933 }
1934
1935 /* Recommended values from "Task and Mesh Distribution Programming". */
1936 anv_pipeline_emit(pipeline, final.mesh_distrib,
1937 GENX(3DSTATE_MESH_DISTRIB), distrib) {
1938 distrib.DistributionMode = MESH_RR_FREE;
1939 distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
1940 distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
1941 }
1942 }
1943 #endif
1944
1945 void
1946 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
1947 const struct vk_graphics_pipeline_state *state)
1948 {
1949 enum intel_urb_deref_block_size urb_deref_block_size;
1950 emit_urb_setup(pipeline, &urb_deref_block_size);
1951
1952 emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp,
1953 urb_deref_block_size);
1954 emit_ms_state(pipeline, state->ms);
1955 compute_kill_pixel(pipeline, state->ms, state);
1956
1957 emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);
1958
1959 #if GFX_VER >= 12
1960 emit_3dstate_primitive_replication(pipeline, state->rp);
1961 #endif
1962
1963 #if GFX_VERx10 >= 125
1964 anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
1965 /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
1966 vfg.DistributionMode =
1967 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
1968 RR_FREE;
1969 vfg.DistributionGranularity = BatchLevelGranularity;
1970 #if INTEL_WA_14014851047_GFX_VER
1971 vfg.GranularityThresholdDisable =
1972 intel_needs_workaround(pipeline->base.base.device->info, 14014851047);
1973 #endif
1974 /* 192 vertices for TRILIST_ADJ */
1975 vfg.ListNBatchSizeScale = 0;
1976 /* Batch size of 384 vertices */
1977 vfg.List3BatchSizeScale = 2;
1978 /* Batch size of 128 vertices */
1979 vfg.List2BatchSizeScale = 1;
1980 /* Batch size of 128 vertices */
1981 vfg.List1BatchSizeScale = 2;
1982 /* Batch size of 256 vertices for STRIP topologies */
1983 vfg.StripBatchSizeScale = 3;
1984 /* 192 control points for PATCHLIST_3 */
1985 vfg.PatchBatchSizeScale = 1;
1986 /* 192 control points for PATCHLIST_3 */
1987 vfg.PatchBatchSizeMultiplier = 31;
1988 }
1989 #endif
1990
1991 emit_3dstate_vf_statistics(pipeline);
1992
1993 if (anv_pipeline_is_primitive(pipeline)) {
1994 emit_vertex_input(pipeline, state, state->vi);
1995
1996 emit_3dstate_vs(pipeline);
1997 emit_3dstate_hs_ds(pipeline, state->ts);
1998 emit_3dstate_te(pipeline);
1999 emit_3dstate_gs(pipeline);
2000
2001 emit_3dstate_streamout(pipeline, state->rs);
2002
2003 #if GFX_VERx10 >= 125
2004 const struct anv_device *device = pipeline->base.base.device;
2005 /* Disable Mesh. */
2006 if (device->vk.enabled_extensions.EXT_mesh_shader) {
2007 anv_pipeline_emit(pipeline, final.mesh_control,
2008 GENX(3DSTATE_MESH_CONTROL), zero);
2009 anv_pipeline_emit(pipeline, final.mesh_shader,
2010 GENX(3DSTATE_MESH_SHADER), zero);
2011 anv_pipeline_emit(pipeline, final.mesh_distrib,
2012 GENX(3DSTATE_MESH_DISTRIB), zero);
2013 anv_pipeline_emit(pipeline, final.clip_mesh,
2014 GENX(3DSTATE_CLIP_MESH), zero);
2015 anv_pipeline_emit(pipeline, final.sbe_mesh,
2016 GENX(3DSTATE_SBE_MESH), zero);
2017 anv_pipeline_emit(pipeline, final.task_control,
2018 GENX(3DSTATE_TASK_CONTROL), zero);
2019 anv_pipeline_emit(pipeline, final.task_shader,
2020 GENX(3DSTATE_TASK_SHADER), zero);
2021 anv_pipeline_emit(pipeline, final.task_redistrib,
2022 GENX(3DSTATE_TASK_REDISTRIB), zero);
2023 }
2024 #endif
2025 } else {
2026 assert(anv_pipeline_is_mesh(pipeline));
2027
2028 anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs);
2029 #if GFX_VER >= 11
2030 anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs);
2031 #endif
2032 anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs);
2033 anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
2034 anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
2035 anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te);
2036 anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
2037
2038 /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
2039 * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
2040 */
2041 anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
2042
2043 #if GFX_VERx10 >= 125
2044 emit_task_state(pipeline);
2045 emit_mesh_state(pipeline);
2046 #endif
2047 }
2048
2049 emit_3dstate_sbe(pipeline);
2050 emit_3dstate_wm(pipeline, state->ia, state->rs,
2051 state->ms, state->cb, state->rp);
2052 emit_3dstate_ps(pipeline, state->ms, state->cb);
2053 emit_3dstate_ps_extra(pipeline, state->rs, state);
2054 }
2055
2056 #if GFX_VERx10 >= 125
2057
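/* On Gfx12.5+ the interface descriptor is supplied inline with the
 * COMPUTE_WALKER command at dispatch time, so the only thing decided at
 * pipeline build time is the L3 configuration.
 */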
2058 void
2059 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
2060 {
2061 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2062 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2063 }
2064
2065 #else /* #if GFX_VERx10 >= 125 */
2066
2067 void
2068 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
2069 {
2070 struct anv_device *device = pipeline->base.device;
2071 const struct intel_device_info *devinfo = device->info;
2072 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2073
2074 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2075
2076 const struct intel_cs_dispatch_info dispatch =
2077 brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
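/* CURBE space holds the cross-thread constants once plus the per-thread
 * constants replicated for every thread, counted in 32-byte registers and
 * rounded up to an even register count.
 */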
2078 const uint32_t vfe_curbe_allocation =
2079 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
2080 cs_prog_data->push.cross_thread.regs, 2);
2081
2082 const struct anv_shader_bin *cs_bin = pipeline->cs;
2083
2084 anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
2085 vfe.StackSize = 0;
2086 vfe.MaximumNumberofThreads =
2087 devinfo->max_cs_threads * devinfo->subslice_total - 1;
2088 vfe.NumberofURBEntries = 2;
2089 #if GFX_VER < 11
2090 vfe.ResetGatewayTimer = true;
2091 #endif
2092 vfe.URBEntryAllocationSize = 2;
2093 vfe.CURBEAllocationSize = vfe_curbe_allocation;
2094
2095 if (cs_bin->prog_data->total_scratch) {
2096 /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
2097 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
2098 */
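/* e.g. total_scratch = 4096 gives ffs(4096) - 11 = 13 - 11 = 2, the 4k
 * encoding.
 */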
2099 vfe.PerThreadScratchSpace =
2100 ffs(cs_bin->prog_data->total_scratch) - 11;
2101 vfe.ScratchSpaceBasePointer =
2102 get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2103 }
2104 }
2105
2106 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
2107 .KernelStartPointer =
2108 cs_bin->kernel.offset +
2109 brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
2110
2111 /* Wa_1606682166 */
2112 .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
2113 /* We add 1 because the CS indirect parameters buffer isn't accounted
2114 * for in bind_map.surface_count.
2115 *
2116 * Set to 0 on Gfx12.5 to avoid the hardware prefetching the binding table on every thread dispatch.
2117 */
2118 .BindingTableEntryCount = devinfo->verx10 == 125 ?
2119 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
2120 .BarrierEnable = cs_prog_data->uses_barrier,
2121 .SharedLocalMemorySize =
2122 encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
2123
2124 .ConstantURBEntryReadOffset = 0,
2125 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
2126 .CrossThreadConstantDataReadLength =
2127 cs_prog_data->push.cross_thread.regs,
2128 #if GFX_VER >= 12
2129 /* TODO: Check if we are missing workarounds and enable mid-thread
2130 * preemption.
2131 *
2132 * We still have issues with mid-thread preemption (it was already
2133 * disabled by the kernel on gfx11, due to missing workarounds). It's
2134 * possible that we are just missing some workarounds, and could enable
2135 * it later, but for now let's disable it to fix a GPU hang in Car Chase
2136 * (and possibly more).
2137 */
2138 .ThreadPreemptionDisable = true,
2139 #endif
2140
2141 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
2142 };
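/* Pack the descriptor into pipeline->interface_descriptor_data; the compute
 * dispatch path loads it later (MEDIA_INTERFACE_DESCRIPTOR_LOAD) instead of
 * emitting it into this batch.
 */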
2143 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
2144 pipeline->interface_descriptor_data,
2145 &desc);
2146 }
2147
2148 #endif /* #if GFX_VERx10 >= 125 */
2149
2150 #if GFX_VERx10 >= 125
2151
2152 void
2153 genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
2154 {
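/* Pack one hardware SBT record per shader group; these handle bytes are what
 * vkGetRayTracingShaderGroupHandlesKHR() returns to the application.
 */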
2155 for (uint32_t i = 0; i < pipeline->group_count; i++) {
2156 struct anv_rt_shader_group *group = &pipeline->groups[i];
2157
2158 switch (group->type) {
2159 case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
2160 struct GENX(RT_GENERAL_SBT_HANDLE) sh = {};
2161 sh.General = anv_shader_bin_get_bsr(group->general, 32);
2162 GENX(RT_GENERAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2163 break;
2164 }
2165
2166 case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
2167 struct GENX(RT_TRIANGLES_SBT_HANDLE) sh = {};
2168 if (group->closest_hit)
2169 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2170 if (group->any_hit)
2171 sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
2172 GENX(RT_TRIANGLES_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2173 break;
2174 }
2175
2176 case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
2177 struct GENX(RT_PROCEDURAL_SBT_HANDLE) sh = {};
2178 if (group->closest_hit)
2179 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2180 sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
2181 GENX(RT_PROCEDURAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2182 break;
2183 }
2184
2185 default:
2186 unreachable("Invalid shader group type");
2187 }
2188 }
2189 }
2190
2191 #else
2192
2193 void
2194 genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
2195 {
2196 unreachable("Ray tracing not supported");
2197 }
2198
2199 #endif /* GFX_VERx10 >= 125 */
2200