1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "anv_private.h"
25
26 #include "genxml/gen_macros.h"
27 #include "genxml/genX_pack.h"
28 #include "genxml/genX_rt_pack.h"
29
30 #include "common/intel_compute_slm.h"
31 #include "common/intel_genX_state_brw.h"
32 #include "common/intel_l3_config.h"
33 #include "common/intel_sample_positions.h"
34 #include "nir/nir_xfb_info.h"
35 #include "vk_util.h"
36 #include "vk_format.h"
37 #include "vk_log.h"
38 #include "vk_render_pass.h"
39
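/* Record that the next n_dwords dwords emitted into the pipeline batch belong
 * to the state group tracked by `ptr`, so the packet can later be copied into
 * a command buffer with anv_batch_emit_pipeline_state(). Consecutive calls for
 * the same pointer must be contiguous in the batch (see the assert below).
 */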
40 static inline struct anv_batch *
anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
42 struct anv_gfx_state_ptr *ptr,
43 uint32_t n_dwords)
44 {
45 struct anv_batch *batch = &pipeline->base.base.batch;
46
47 assert(ptr->len == 0 ||
48 (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
49 if (ptr->len == 0)
50 ptr->offset = (batch->next - batch->start) / 4;
51 ptr->len += n_dwords;
52
53 return batch;
54 }
55
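/* Pack an instruction into a caller-provided dword array (`field`) instead of
 * the pipeline batch. Used to build a template that is later combined with
 * anv_pipeline_emit_merge().
 */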
56 #define anv_pipeline_emit_tmp(pipeline, field, cmd, name) \
57 for (struct cmd name = { __anv_cmd_header(cmd) }, \
58 *_dst = (void *) field; \
59 __builtin_expect(_dst != NULL, 1); \
60 ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch, \
61 _dst, &name); \
62 VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
63 _dst = NULL; \
64 }))
65
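/* Emit an instruction into the pipeline batch and record its location in the
 * given gfx state pointer so it can be replayed into command buffers later.
 */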
66 #define anv_pipeline_emit(pipeline, state, cmd, name) \
67 for (struct cmd name = { __anv_cmd_header(cmd) }, \
68 *_dst = anv_batch_emit_dwords( \
69 anv_gfx_pipeline_add(pipeline, \
70 &(pipeline)->state, \
71 __anv_cmd_length(cmd)), \
72 __anv_cmd_length(cmd)); \
73 __builtin_expect(_dst != NULL, 1); \
74 ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch, \
75 _dst, &name); \
76 VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
77 _dst = NULL; \
78 }))
79
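/* Pack an instruction into a temporary buffer and OR it, dword by dword, with
 * the caller-provided `dwords` template before writing the result into the
 * batch location tracked by `state`.
 */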
80 #define anv_pipeline_emit_merge(pipeline, state, dwords, cmd, name) \
81 for (struct cmd name = { 0 }, \
82 *_dst = anv_batch_emit_dwords( \
83 anv_gfx_pipeline_add(pipeline, \
84 &(pipeline)->state, \
85 __anv_cmd_length(cmd)), \
86 __anv_cmd_length(cmd)); \
87 __builtin_expect(_dst != NULL, 1); \
88 ({ uint32_t _partial[__anv_cmd_length(cmd)]; \
89 assert((pipeline)->state.len == __anv_cmd_length(cmd)); \
90 __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch, \
91 _partial, &name); \
92 for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
93 ((uint32_t *)_dst)[i] = _partial[i] | dwords[i]; \
94 } \
95 VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
96 _dst = NULL; \
97 }))
98
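/* Variable-length version of anv_pipeline_emit(): emits `n` dwords and fills
 * in DWordLength accordingly. Returns the destination pointer (or NULL on
 * batch allocation failure) so trailing dwords can be written by the caller.
 */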
99 #define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({ \
100 void *__dst = anv_batch_emit_dwords( \
101 anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n); \
102 if (__dst) { \
103 struct cmd __template = { \
104 __anv_cmd_header(cmd), \
105 .DWordLength = n - __anv_cmd_length_bias(cmd), \
106 __VA_ARGS__ \
107 }; \
108 __anv_cmd_pack(cmd)(&pipeline->base.base.batch, \
109 __dst, &__template); \
110 } \
111 __dst; \
112 })
113
114 #define pipeline_needs_protected(pipeline) \
115 ((pipeline)->device->vk.enabled_features.protectedMemory)
116
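/* Choose the 3D_Vertex_Component_Control value for component `comp` of a
 * vertex element with the given source format.
 */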
117 static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
119 {
120 uint8_t bits;
121 switch (comp) {
122 case 0: bits = isl_format_layouts[format].channels.r.bits; break;
123 case 1: bits = isl_format_layouts[format].channels.g.bits; break;
124 case 2: bits = isl_format_layouts[format].channels.b.bits; break;
125 case 3: bits = isl_format_layouts[format].channels.a.bits; break;
126 default: unreachable("Invalid component");
127 }
128
129 /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
131 *
132 * From Broadwell spec, command reference structures, page 586:
133 * "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    * 64-bit components are stored in the URB without any conversion. In
135 * this case, vertex elements must be written as 128 or 256 bits, with
136 * VFCOMP_STORE_0 being used to pad the output as required. E.g., if
137 * R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
138 * Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
139 * set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
140 * Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
141 * a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
142 * Component 3 to be specified as VFCOMP_STORE_0 in order to output a
143 * 256-bit vertex element."
144 */
145 if (bits) {
146 return VFCOMP_STORE_SRC;
147 } else if (comp >= 2 &&
148 !isl_format_layouts[format].channels.b.bits &&
149 isl_format_layouts[format].channels.r.type == ISL_RAW) {
150 /* When emitting 64-bit attributes, we need to write either 128 or 256
151 * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
152 * VFCOMP_STORE_0 to pad the written chunk */
153 return VFCOMP_NOSTORE;
154 } else if (comp < 3 ||
155 isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware restrictions
157 * (see comment above) */
158 return VFCOMP_STORE_0;
159 } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
160 isl_format_layouts[format].channels.r.type == ISL_SINT) {
161 assert(comp == 3);
162 return VFCOMP_STORE_1_INT;
163 } else {
164 assert(comp == 3);
165 return VFCOMP_STORE_1_FP;
166 }
167 }
168
169 static void
emit_ves_vf_instancing(struct anv_batch *batch,
171 uint32_t *vertex_element_dws,
172 struct anv_graphics_pipeline *pipeline,
173 const struct vk_vertex_input_state *vi,
174 bool emit_in_pipeline)
175 {
176 const struct anv_device *device = pipeline->base.base.device;
177 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
178 const uint64_t inputs_read = vs_prog_data->inputs_read;
179 const uint64_t double_inputs_read =
180 vs_prog_data->double_inputs_read & inputs_read;
181 assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
182 const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
183 const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
184
185 for (uint32_t i = 0; i < pipeline->vs_input_elements; i++) {
186 /* The SKL docs for VERTEX_ELEMENT_STATE say:
187 *
188 * "All elements must be valid from Element[0] to the last valid
189 * element. (I.e. if Element[2] is valid then Element[1] and
190 * Element[0] must also be valid)."
191 *
192 * The SKL docs for 3D_Vertex_Component_Control say:
193 *
194 * "Don't store this component. (Not valid for Component 0, but can
195 * be used for Component 1-3)."
196 *
197 * So we can't just leave a vertex element blank and hope for the best.
198 * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zeros.
200 *
201 * TODO: Compact vertex elements so we never end up with holes.
202 */
203 struct GENX(VERTEX_ELEMENT_STATE) element = {
204 .Valid = true,
205 .Component0Control = VFCOMP_STORE_0,
206 .Component1Control = VFCOMP_STORE_0,
207 .Component2Control = VFCOMP_STORE_0,
208 .Component3Control = VFCOMP_STORE_0,
209 };
210 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
211 &vertex_element_dws[i * 2],
212 &element);
213 }
214
215 u_foreach_bit(a, vi->attributes_valid) {
216 enum isl_format format = anv_get_isl_format(device->physical,
217 vi->attributes[a].format,
218 VK_IMAGE_ASPECT_COLOR_BIT,
219 VK_IMAGE_TILING_LINEAR);
220 assume(format < ISL_NUM_FORMATS);
221
222 uint32_t binding = vi->attributes[a].binding;
223 assert(binding < MAX_VBS);
224
225 if ((elements & (1 << a)) == 0)
226 continue; /* Binding unused */
227
228 uint32_t slot =
229 __builtin_popcount(elements & ((1 << a) - 1)) -
230 DIV_ROUND_UP(__builtin_popcount(elements_double &
231 ((1 << a) -1)), 2);
232
233 struct GENX(VERTEX_ELEMENT_STATE) element = {
234 .VertexBufferIndex = vi->attributes[a].binding,
235 .Valid = true,
236 .SourceElementFormat = format,
237 .EdgeFlagEnable = false,
238 .SourceElementOffset = vi->attributes[a].offset,
239 .Component0Control = vertex_element_comp_control(format, 0),
240 .Component1Control = vertex_element_comp_control(format, 1),
241 .Component2Control = vertex_element_comp_control(format, 2),
242 .Component3Control = vertex_element_comp_control(format, 3),
243 };
244 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
245 &vertex_element_dws[slot * 2],
246 &element);
247
248 /* On Broadwell and later, we have a separate VF_INSTANCING packet
249 * that controls instancing. On Haswell and prior, that's part of
250 * VERTEX_BUFFER_STATE which we emit later.
251 */
252 if (emit_in_pipeline) {
253 anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
254 bool per_instance = vi->bindings[binding].input_rate ==
255 VK_VERTEX_INPUT_RATE_INSTANCE;
256 uint32_t divisor = vi->bindings[binding].divisor *
257 pipeline->instance_multiplier;
258
259 vfi.InstancingEnable = per_instance;
260 vfi.VertexElementIndex = slot;
261 vfi.InstanceDataStepRate = per_instance ? divisor : 1;
262 }
263 } else {
264 anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
265 bool per_instance = vi->bindings[binding].input_rate ==
266 VK_VERTEX_INPUT_RATE_INSTANCE;
267 uint32_t divisor = vi->bindings[binding].divisor *
268 pipeline->instance_multiplier;
269
270 vfi.InstancingEnable = per_instance;
271 vfi.VertexElementIndex = slot;
272 vfi.InstanceDataStepRate = per_instance ? divisor : 1;
273 }
274 }
275 }
276 }
277
278 void
genX(batch_emit_vertex_input)(struct anv_batch *batch,
280 struct anv_device *device,
281 struct anv_graphics_pipeline *pipeline,
282 const struct vk_vertex_input_state *vi)
283 {
284 const uint32_t ve_count =
285 pipeline->vs_input_elements + pipeline->svgs_count;
286 const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
287 uint32_t *p = anv_batch_emitn(batch, num_dwords,
288 GENX(3DSTATE_VERTEX_ELEMENTS));
289 if (p == NULL)
290 return;
291
292 if (ve_count == 0) {
293 memcpy(p + 1, device->physical->empty_vs_input,
294 sizeof(device->physical->empty_vs_input));
295 } else if (ve_count == pipeline->vertex_input_elems) {
296 /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so everything is
297 * in pipeline->vertex_input_data and we can just memcpy
298 */
299 memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
300 anv_batch_emit_pipeline_state(batch, pipeline, final.vf_instancing);
301 } else {
302 assert(pipeline->final.vf_instancing.len == 0);
303 /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
304 emit_ves_vf_instancing(batch, p + 1, pipeline, vi,
305 false /* emit_in_pipeline */);
306 /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
307 memcpy(p + 1 + 2 * pipeline->vs_input_elements,
308 pipeline->vertex_input_data,
309 4 * 2 * pipeline->vertex_input_elems);
310 }
311 }
312
313 static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
315 const struct vk_graphics_pipeline_state *state,
316 const struct vk_vertex_input_state *vi)
317 {
318 /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
319 * everything in gfx8_cmd_buffer.c
320 */
321 if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
322 emit_ves_vf_instancing(NULL,
323 pipeline->vertex_input_data,
324 pipeline, vi, true /* emit_in_pipeline */);
325 }
326
327 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
328 const bool needs_svgs_elem = pipeline->svgs_count > 1 ||
329 !vs_prog_data->uses_drawid;
330 const uint32_t id_slot = pipeline->vs_input_elements;
331 const uint32_t drawid_slot = id_slot + needs_svgs_elem;
332 if (pipeline->svgs_count > 0) {
333 assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
334 uint32_t slot_offset =
335 pipeline->vertex_input_elems - pipeline->svgs_count;
336
337 if (needs_svgs_elem) {
338 #if GFX_VER < 11
339 /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
340 * "Within a VERTEX_ELEMENT_STATE structure, if a Component
341 * Control field is set to something other than VFCOMP_STORE_SRC,
342 * no higher-numbered Component Control fields may be set to
343 * VFCOMP_STORE_SRC"
344 *
        * This means that if we have BaseInstance, we need BaseVertex as
346 * well. Just do all or nothing.
347 */
348 uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
349 vs_prog_data->uses_baseinstance) ?
350 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
351 #endif
352
353 struct GENX(VERTEX_ELEMENT_STATE) element = {
354 .VertexBufferIndex = ANV_SVGS_VB_INDEX,
355 .Valid = true,
356 .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
357 #if GFX_VER >= 11
358 /* On gen11, these are taken care of by extra parameter slots */
359 .Component0Control = VFCOMP_STORE_0,
360 .Component1Control = VFCOMP_STORE_0,
361 #else
362 .Component0Control = base_ctrl,
363 .Component1Control = base_ctrl,
364 #endif
365 .Component2Control = VFCOMP_STORE_0,
366 .Component3Control = VFCOMP_STORE_0,
367 };
368 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
369 &pipeline->vertex_input_data[slot_offset * 2],
370 &element);
371 slot_offset++;
372
373 anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
374 GENX(3DSTATE_VF_INSTANCING), vfi) {
375 vfi.VertexElementIndex = id_slot;
376 }
377 }
378
379 if (vs_prog_data->uses_drawid) {
380 struct GENX(VERTEX_ELEMENT_STATE) element = {
381 .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
382 .Valid = true,
383 .SourceElementFormat = ISL_FORMAT_R32_UINT,
384 #if GFX_VER >= 11
385 /* On gen11, this is taken care of by extra parameter slots */
386 .Component0Control = VFCOMP_STORE_0,
387 #else
388 .Component0Control = VFCOMP_STORE_SRC,
389 #endif
390 .Component1Control = VFCOMP_STORE_0,
391 .Component2Control = VFCOMP_STORE_0,
392 .Component3Control = VFCOMP_STORE_0,
393 };
394 GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
395 &pipeline->vertex_input_data[slot_offset * 2],
396 &element);
397 slot_offset++;
398
399 anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
400 GENX(3DSTATE_VF_INSTANCING), vfi) {
401 vfi.VertexElementIndex = drawid_slot;
402 }
403 }
404 }
405
406 anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
407 sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
408 sgvs.VertexIDComponentNumber = 2;
409 sgvs.VertexIDElementOffset = id_slot;
410 sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid;
411 sgvs.InstanceIDComponentNumber = 3;
412 sgvs.InstanceIDElementOffset = id_slot;
413 }
414
415 #if GFX_VER >= 11
416 anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
417 /* gl_BaseVertex */
418 sgvs.XP0Enable = vs_prog_data->uses_firstvertex;
419 sgvs.XP0SourceSelect = XP0_PARAMETER;
420 sgvs.XP0ComponentNumber = 0;
421 sgvs.XP0ElementOffset = id_slot;
422
423 /* gl_BaseInstance */
424 sgvs.XP1Enable = vs_prog_data->uses_baseinstance;
425 sgvs.XP1SourceSelect = StartingInstanceLocation;
426 sgvs.XP1ComponentNumber = 1;
427 sgvs.XP1ElementOffset = id_slot;
428
429 /* gl_DrawID */
430 sgvs.XP2Enable = vs_prog_data->uses_drawid;
431 sgvs.XP2ComponentNumber = 0;
432 sgvs.XP2ElementOffset = drawid_slot;
433 }
434 #endif
435 }
436
437 void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
439 const struct intel_l3_config *l3_config,
440 VkShaderStageFlags active_stages,
441 const struct intel_urb_config *urb_cfg_in,
442 struct intel_urb_config *urb_cfg_out,
443 enum intel_urb_deref_block_size *deref_block_size)
444 {
445 const struct intel_device_info *devinfo = device->info;
446
447 bool constrained;
448 intel_get_urb_config(devinfo, l3_config,
449 active_stages &
450 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
451 active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
452 urb_cfg_out, deref_block_size,
453 &constrained);
454
455 #if INTEL_NEEDS_WA_16014912113
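   /* Wa_16014912113: when the TES URB allocation changes, first replay the
    * previous per-stage allocation with all entries assigned to the VS
    * (256 entries, 0 for the other stages) and flush before programming the
    * new configuration below.
    */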
456 if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out,
457 MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) {
458 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
459 #if GFX_VER >= 12
460 anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
461 urb._3DCommandSubOpcode += i;
462 urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1;
463 urb.VSURBStartingAddressSlice0 = urb_cfg_in->start[i];
464 urb.VSURBStartingAddressSliceN = urb_cfg_in->start[i];
465 urb.VSNumberofURBEntriesSlice0 = i == 0 ? 256 : 0;
466 urb.VSNumberofURBEntriesSliceN = i == 0 ? 256 : 0;
467 }
468 #else
469 anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
470 urb._3DCommandSubOpcode += i;
471 urb.VSURBStartingAddress = urb_cfg_in->start[i];
472 urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1;
473 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
474 }
475 #endif
476 }
477 genx_batch_emit_pipe_control(batch, device->info, _3D,
478 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
479 }
480 #endif
481
482 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
483 #if GFX_VER >= 12
484 anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
485 urb._3DCommandSubOpcode += i;
486 urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1;
487 urb.VSURBStartingAddressSlice0 = urb_cfg_out->start[i];
488 urb.VSURBStartingAddressSliceN = urb_cfg_out->start[i];
489 urb.VSNumberofURBEntriesSlice0 = urb_cfg_out->entries[i];
490 urb.VSNumberofURBEntriesSliceN = urb_cfg_out->entries[i];
491 }
492 #else
493 anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
494 urb._3DCommandSubOpcode += i;
495 urb.VSURBStartingAddress = urb_cfg_out->start[i];
496 urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1;
497 urb.VSNumberofURBEntries = urb_cfg_out->entries[i];
498 }
499 #endif
500 }
501
502 #if GFX_VERx10 >= 125
503 if (device->vk.enabled_extensions.EXT_mesh_shader) {
504 anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
505 anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
506 }
507 #endif
508 }
509
510 #if GFX_VERx10 >= 125
511 static void
emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
513 enum intel_urb_deref_block_size *deref_block_size)
514 {
515 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
516
517 const struct brw_task_prog_data *task_prog_data =
518 anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
519 get_task_prog_data(pipeline) : NULL;
520 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
521
522 const struct intel_mesh_urb_allocation alloc =
523 intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
524 task_prog_data ? task_prog_data->map.size_dw : 0,
525 mesh_prog_data->map.size_dw);
526
527 /* Zero out the primitive pipeline URB allocations. */
528 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
529 #if GFX_VER >= 12
530 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_VS), urb) {
531 urb._3DCommandSubOpcode += i;
532 }
533 #else
534 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
535 urb._3DCommandSubOpcode += i;
536 }
537 #endif
538 }
539
540 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
541 if (task_prog_data) {
542 urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1;
543 urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
544 urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
545 urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
546 urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
547 }
548 }
549
550 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
551 urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1;
552 urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
553 urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
554 urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
555 urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
556 }
557
558 *deref_block_size = alloc.deref_block_size;
559 }
560 #endif
561
562 static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
564 enum intel_urb_deref_block_size *deref_block_size)
565 {
566 #if GFX_VERx10 >= 125
567 if (anv_pipeline_is_mesh(pipeline)) {
568 emit_urb_setup_mesh(pipeline, deref_block_size);
569 return;
570 }
571 #endif
572 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
573 const struct brw_vue_prog_data *prog_data =
574 !anv_pipeline_has_stage(pipeline, i) ? NULL :
575 (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data;
576
577 pipeline->urb_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
578 }
579
580 struct anv_device *device = pipeline->base.base.device;
581 const struct intel_device_info *devinfo = device->info;
582
583
584 bool constrained;
585 intel_get_urb_config(devinfo,
586 pipeline->base.base.l3_config,
587 pipeline->base.base.active_stages &
588 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
589 pipeline->base.base.active_stages &
590 VK_SHADER_STAGE_GEOMETRY_BIT,
591 &pipeline->urb_cfg, deref_block_size,
592 &constrained);
593
594 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
595 #if GFX_VER >= 12
596 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_VS), urb) {
597 urb._3DCommandSubOpcode += i;
598 urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1;
599 urb.VSURBStartingAddressSlice0 = pipeline->urb_cfg.start[i];
600 urb.VSURBStartingAddressSliceN = pipeline->urb_cfg.start[i];
601 urb.VSNumberofURBEntriesSlice0 = pipeline->urb_cfg.entries[i];
602 urb.VSNumberofURBEntriesSliceN = pipeline->urb_cfg.entries[i];
603 }
604 #else
605 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
606 urb._3DCommandSubOpcode += i;
607 urb.VSURBStartingAddress = pipeline->urb_cfg.start[i];
608 urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1;
609 urb.VSNumberofURBEntries = pipeline->urb_cfg.entries[i];
610 }
611 #endif
612 }
613
614 #if GFX_VERx10 >= 125
615 if (device->vk.enabled_extensions.EXT_mesh_shader) {
616 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
617 anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
618 }
619 #endif
620
621 }
622
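/* Returns true when the fragment shader reads gl_PrimitiveID but no earlier
 * stage writes it, in which case the SBE unit has to supply the value.
 */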
623 static bool
sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
625 {
626 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
627 if (!wm_prog_data)
628 return false;
629
630 if (anv_pipeline_is_mesh(pipeline)) {
631 const struct brw_mesh_prog_data *mesh_prog_data =
632 get_mesh_prog_data(pipeline);
633 const struct brw_mue_map *mue = &mesh_prog_data->map;
634 return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
635 mue->start_dw[VARYING_SLOT_PRIMITIVE_ID] == -1;
636 }
637
638 const struct intel_vue_map *fs_input_map =
639 &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
640
641 return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
642 fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
643 }
644
645 static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
647 {
648 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
649
650 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
651 anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
652 anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
653 #if GFX_VERx10 >= 125
654 if (anv_pipeline_is_mesh(pipeline))
655 anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
656 #endif
657 return;
658 }
659
660 anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
661 anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
662
663 /* TODO(mesh): Figure out cases where we need attribute swizzling. See also
664 * calculate_urb_setup() and related functions.
665 */
666 sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
667 sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
668 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
669 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
670
671 for (unsigned i = 0; i < 32; i++)
672 sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
673
674 if (anv_pipeline_is_primitive(pipeline)) {
675 const struct intel_vue_map *fs_input_map =
676 &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
677
678 int first_slot =
679 brw_compute_first_urb_slot_required(wm_prog_data->inputs,
680 fs_input_map);
681 assert(first_slot % 2 == 0);
682 unsigned urb_entry_read_offset = first_slot / 2;
683 int max_source_attr = 0;
684 for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
685 uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
686 int input_index = wm_prog_data->urb_setup[attr];
687
688 assert(0 <= input_index);
689
690 /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
691 * VUE header
692 */
693 if (attr == VARYING_SLOT_VIEWPORT ||
694 attr == VARYING_SLOT_LAYER ||
695 attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
696 continue;
697 }
698
699 if (attr == VARYING_SLOT_PNTC) {
700 sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
701 continue;
702 }
703
704 const int slot = fs_input_map->varying_to_slot[attr];
705
706 if (slot == -1) {
707 /* This attribute does not exist in the VUE--that means that
708 * the vertex shader did not write to it. It could be that it's
709 * a regular varying read by the fragment shader but not
710 * written by the vertex shader or it's gl_PrimitiveID. In the
711 * first case the value is undefined, in the second it needs to
712 * be gl_PrimitiveID.
713 */
714 swiz.Attribute[input_index].ConstantSource = PRIM_ID;
715 swiz.Attribute[input_index].ComponentOverrideX = true;
716 swiz.Attribute[input_index].ComponentOverrideY = true;
717 swiz.Attribute[input_index].ComponentOverrideZ = true;
718 swiz.Attribute[input_index].ComponentOverrideW = true;
719 continue;
720 }
721
722 /* We have to subtract two slots to account for the URB entry
723 * output read offset in the VS and GS stages.
724 */
725 const int source_attr = slot - 2 * urb_entry_read_offset;
726 assert(source_attr >= 0 && source_attr < 32);
727 max_source_attr = MAX2(max_source_attr, source_attr);
            /* The hardware can only apply source-attribute overrides to the
             * first 16 attributes; attributes beyond that must already be
             * lined up so that the input index equals the output index.
             * We'll need to do some tweaking to make sure that's the case.
             */
733 if (input_index < 16)
734 swiz.Attribute[input_index].SourceAttribute = source_attr;
735 else
736 assert(source_attr == input_index);
737 }
738
739 sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
740 sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
741 sbe.ForceVertexURBEntryReadOffset = true;
742 sbe.ForceVertexURBEntryReadLength = true;
743
744 /* Ask the hardware to supply PrimitiveID if the fragment shader
745 * reads it but a previous stage didn't write one.
746 */
747 if (sbe_primitive_id_override(pipeline)) {
748 sbe.PrimitiveIDOverrideAttributeSelect =
749 wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
750 sbe.PrimitiveIDOverrideComponentX = true;
751 sbe.PrimitiveIDOverrideComponentY = true;
752 sbe.PrimitiveIDOverrideComponentZ = true;
753 sbe.PrimitiveIDOverrideComponentW = true;
754 }
755 } else {
756 assert(anv_pipeline_is_mesh(pipeline));
757 #if GFX_VERx10 >= 125
758 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
759 anv_pipeline_emit(pipeline, final.sbe_mesh,
760 GENX(3DSTATE_SBE_MESH), sbe_mesh) {
761 const struct brw_mue_map *mue = &mesh_prog_data->map;
762
763 assert(mue->per_vertex_header_size_dw % 8 == 0);
764 sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
765 sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
766
767 /* Clip distance array is passed in the per-vertex header so that
768 * it can be consumed by the HW. If user wants to read it in the
769 * FS, adjust the offset and length to cover it. Conveniently it
770 * is at the end of the per-vertex header, right before per-vertex
771 * attributes.
772 *
773 * Note that FS attribute reading must be aware that the clip
774 * distances have fixed position.
775 */
776 if (mue->per_vertex_header_size_dw > 8 &&
777 (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
778 wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
779 sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
780 sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
781 }
782
783 if (mue->user_data_in_vertex_header) {
784 sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
785 sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
786 }
787
788 assert(mue->per_primitive_header_size_dw % 8 == 0);
789 sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
790 mue->per_primitive_header_size_dw / 8;
791 sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
792 DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
793
794 /* Just like with clip distances, if Primitive Shading Rate,
795 * Viewport Index or Layer is read back in the FS, adjust the
796 * offset and length to cover the Primitive Header, where PSR,
797 * Viewport Index & Layer are stored.
798 */
799 if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
800 wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
801 wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
802 mue->user_data_in_primitive_header) {
803 assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
804 sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
805 sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
806 }
807 }
808 #endif
809 }
810 }
811 }
812 }
813
814 static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
816 const struct vk_input_assembly_state *ia,
817 const struct vk_rasterization_state *rs,
818 const struct vk_multisample_state *ms,
819 const struct vk_render_pass_state *rp,
820 enum intel_urb_deref_block_size urb_deref_block_size)
821 {
822 anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
823 sf.ViewportTransformEnable = true;
824 sf.StatisticsEnable = true;
825 sf.VertexSubPixelPrecisionSelect = _8Bit;
826 sf.AALineDistanceMode = true;
827
828 #if GFX_VER >= 12
829 sf.DerefBlockSize = urb_deref_block_size;
830 #endif
831
832 bool point_from_shader;
833 if (anv_pipeline_is_primitive(pipeline)) {
834 const struct brw_vue_prog_data *last_vue_prog_data =
835 anv_pipeline_get_last_vue_prog_data(pipeline);
836 point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
837 } else {
838 assert(anv_pipeline_is_mesh(pipeline));
839 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
840 point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
841 }
842
843 if (point_from_shader) {
844 sf.PointWidthSource = Vertex;
845 } else {
846 sf.PointWidthSource = State;
847 sf.PointWidth = 1.0;
848 }
849 }
850 }
851
852 static void
emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
854 const struct vk_input_assembly_state *ia,
855 const struct vk_viewport_state *vp,
856 const struct vk_rasterization_state *rs)
857 {
858 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
859 (void) wm_prog_data;
860
861 anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
862 clip.ClipEnable = true;
863 clip.StatisticsEnable = true;
864 clip.EarlyCullEnable = true;
865 clip.GuardbandClipTestEnable = true;
866
867 clip.VertexSubPixelPrecisionSelect = _8Bit;
868 clip.ClipMode = CLIPMODE_NORMAL;
869
870 clip.MinimumPointWidth = 0.125;
871 clip.MaximumPointWidth = 255.875;
872
873 /* TODO(mesh): Multiview. */
874 if (anv_pipeline_is_primitive(pipeline)) {
875 const struct brw_vue_prog_data *last =
876 anv_pipeline_get_last_vue_prog_data(pipeline);
877
878 /* From the Vulkan 1.0.45 spec:
879 *
880 * "If the last active vertex processing stage shader entry point's
881 * interface does not include a variable decorated with Layer, then
882 * the first layer is used."
883 */
884 clip.ForceZeroRTAIndexEnable =
885 !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
886
887 } else if (anv_pipeline_is_mesh(pipeline)) {
888 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
889
890 clip.ForceZeroRTAIndexEnable =
891 mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
892 }
893
894 clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
895 wm_prog_data->uses_nonperspective_interp_modes : 0;
896 }
897
898 #if GFX_VERx10 >= 125
899 if (anv_pipeline_is_mesh(pipeline)) {
900 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
901 anv_pipeline_emit(pipeline, final.clip_mesh,
902 GENX(3DSTATE_CLIP_MESH), clip_mesh) {
903 clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
904 clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
905 clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
906 }
907 }
908 #endif
909 }
910
911 static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
913 const struct vk_rasterization_state *rs)
914 {
915 const struct brw_vue_prog_data *prog_data =
916 anv_pipeline_get_last_vue_prog_data(pipeline);
917 const struct intel_vue_map *vue_map = &prog_data->vue_map;
918
919 nir_xfb_info *xfb_info;
920 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
921 xfb_info = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->xfb_info;
922 else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
923 xfb_info = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
924 else
925 xfb_info = pipeline->base.shaders[MESA_SHADER_VERTEX]->xfb_info;
926
927 if (xfb_info) {
928 struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
929 int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
930 int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
931
932 memset(so_decl, 0, sizeof(so_decl));
933
934 for (unsigned i = 0; i < xfb_info->output_count; i++) {
935 const nir_xfb_output_info *output = &xfb_info->outputs[i];
936 unsigned buffer = output->buffer;
937 unsigned stream = xfb_info->buffer_to_stream[buffer];
938
939 /* Our hardware is unusual in that it requires us to program SO_DECLs
940 * for fake "hole" components, rather than simply taking the offset
941 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
942 * program as many size = 4 holes as we can, then a final hole to
943 * accommodate the final 1, 2, or 3 remaining.
944 */
945 int hole_dwords = (output->offset - next_offset[buffer]) / 4;
946 while (hole_dwords > 0) {
947 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
948 .HoleFlag = 1,
949 .OutputBufferSlot = buffer,
950 .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
951 };
952 hole_dwords -= 4;
953 }
954
955 int varying = output->location;
956 uint8_t component_mask = output->component_mask;
957 /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
958 * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
959 * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y
960 * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
961 * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w
962 */
963 if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
964 varying = VARYING_SLOT_PSIZ;
965 component_mask = 1 << 0; // SO_DECL_COMPMASK_X
966 } else if (varying == VARYING_SLOT_LAYER) {
967 varying = VARYING_SLOT_PSIZ;
968 component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
969 } else if (varying == VARYING_SLOT_VIEWPORT) {
970 varying = VARYING_SLOT_PSIZ;
971 component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
972 } else if (varying == VARYING_SLOT_PSIZ) {
973 component_mask = 1 << 3; // SO_DECL_COMPMASK_W
974 }
975
976 next_offset[buffer] = output->offset +
977 __builtin_popcount(component_mask) * 4;
978
979 const int slot = vue_map->varying_to_slot[varying];
980 if (slot < 0) {
981 /* This can happen if the shader never writes to the varying.
982 * Insert a hole instead of actual varying data.
983 */
984 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
985 .HoleFlag = true,
986 .OutputBufferSlot = buffer,
987 .ComponentMask = component_mask,
988 };
989 } else {
990 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
991 .OutputBufferSlot = buffer,
992 .RegisterIndex = slot,
993 .ComponentMask = component_mask,
994 };
995 }
996 }
997
998 int max_decls = 0;
999 for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
1000 max_decls = MAX2(max_decls, decls[s]);
1001
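      /* Build, for each stream, the bitmask of output buffers it writes; this
       * feeds the StreamtoBufferSelects fields below.
       */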
1002 uint8_t sbs[MAX_XFB_STREAMS] = { };
1003 for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
1004 if (xfb_info->buffers_written & (1 << b))
1005 sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
1006 }
1007
1008 uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
1009 3 + 2 * max_decls,
1010 GENX(3DSTATE_SO_DECL_LIST),
1011 .StreamtoBufferSelects0 = sbs[0],
1012 .StreamtoBufferSelects1 = sbs[1],
1013 .StreamtoBufferSelects2 = sbs[2],
1014 .StreamtoBufferSelects3 = sbs[3],
1015 .NumEntries0 = decls[0],
1016 .NumEntries1 = decls[1],
1017 .NumEntries2 = decls[2],
1018 .NumEntries3 = decls[3]);
1019
1020 for (int i = 0; i < max_decls; i++) {
1021 GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
1022 &(struct GENX(SO_DECL_ENTRY)) {
1023 .Stream0Decl = so_decl[0][i],
1024 .Stream1Decl = so_decl[1][i],
1025 .Stream2Decl = so_decl[2][i],
1026 .Stream3Decl = so_decl[3][i],
1027 });
1028 }
1029 }
1030
1031 anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
1032 if (xfb_info) {
1033 pipeline->uses_xfb = true;
1034
1035 so.SOFunctionEnable = true;
1036 so.SOStatisticsEnable = true;
1037
1038 so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
1039 so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
1040 so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
1041 so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
1042
1043 int urb_entry_read_offset = 0;
1044 int urb_entry_read_length =
1045 (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
1046
1047 /* We always read the whole vertex. This could be reduced at some
1048 * point by reading less and offsetting the register index in the
1049 * SO_DECLs.
1050 */
1051 so.Stream0VertexReadOffset = urb_entry_read_offset;
1052 so.Stream0VertexReadLength = urb_entry_read_length - 1;
1053 so.Stream1VertexReadOffset = urb_entry_read_offset;
1054 so.Stream1VertexReadLength = urb_entry_read_length - 1;
1055 so.Stream2VertexReadOffset = urb_entry_read_offset;
1056 so.Stream2VertexReadLength = urb_entry_read_length - 1;
1057 so.Stream3VertexReadOffset = urb_entry_read_offset;
1058 so.Stream3VertexReadLength = urb_entry_read_length - 1;
1059 }
1060 }
1061 }
1062
1063 static inline uint32_t
get_sampler_count(const struct anv_shader_bin *bin)
1065 {
1066 /* We can potentially have way more than 32 samplers and that's ok.
1067 * However, the 3DSTATE_XS packets only have 3 bits to specify how
1068 * many to pre-fetch and all values above 4 are marked reserved.
1069 */
1070 return DIV_ROUND_UP(CLAMP(bin->bind_map.sampler_count, 0, 16), 4);
1071 }
1072
1073 static UNUSED struct anv_address
get_scratch_address(struct anv_pipeline *pipeline,
1075 gl_shader_stage stage,
1076 const struct anv_shader_bin *bin)
1077 {
1078 return (struct anv_address) {
1079 .bo = anv_scratch_pool_alloc(pipeline->device,
1080 &pipeline->device->scratch_pool,
1081 stage, bin->prog_data->total_scratch),
1082 .offset = 0,
1083 };
1084 }
1085
1086 static UNUSED uint32_t
get_scratch_space(const struct anv_shader_bin *bin)
1088 {
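   /* The PerThreadScratchSpace field uses a power-of-two encoding;
    * ffs(total_scratch / 2048) maps 1KB -> 0, 2KB -> 1, 4KB -> 2, and so on.
    */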
1089 return ffs(bin->prog_data->total_scratch / 2048);
1090 }
1091
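/* Allocate (or reuse) a scratch BO for the given stage, track it in the
 * pipeline's relocation list and return the scratch surface handle in the
 * encoding expected by the ScratchSpaceBuffer field. Returns 0 when the
 * shader uses no scratch space.
 */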
1092 static UNUSED uint32_t
get_scratch_surf(struct anv_pipeline *pipeline,
1094 gl_shader_stage stage,
1095 const struct anv_shader_bin *bin,
1096 bool protected)
1097 {
1098 if (bin->prog_data->total_scratch == 0)
1099 return 0;
1100
1101 struct anv_scratch_pool *pool = protected ?
1102 &pipeline->device->protected_scratch_pool :
1103 &pipeline->device->scratch_pool;
1104 struct anv_bo *bo =
1105 anv_scratch_pool_alloc(pipeline->device, pool,
1106 stage, bin->prog_data->total_scratch);
1107 anv_reloc_list_add_bo(pipeline->batch.relocs, bo);
1108 return anv_scratch_pool_get_surf(pipeline->device, pool,
1109 bin->prog_data->total_scratch) >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
1110 }
1111
1112 static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
1114 {
1115 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1116 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1117 const struct anv_shader_bin *vs_bin =
1118 pipeline->base.shaders[MESA_SHADER_VERTEX];
1119
1120 assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
1121
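   /* Pack the common fields once into vs_dwords, then merge them below with
    * the regular and (when protected memory is enabled) protected scratch
    * surfaces.
    */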
1122 uint32_t vs_dwords[GENX(3DSTATE_VS_length)];
1123 anv_pipeline_emit_tmp(pipeline, vs_dwords, GENX(3DSTATE_VS), vs) {
1124 vs.Enable = true;
1125 vs.StatisticsEnable = true;
1126 vs.KernelStartPointer = vs_bin->kernel.offset;
1127 #if GFX_VER < 20
1128 vs.SIMD8DispatchEnable =
1129 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
1130 #endif
1131
1132 assert(!vs_prog_data->base.base.use_alt_mode);
1133 #if GFX_VER < 11
1134 vs.SingleVertexDispatch = false;
1135 #endif
1136 vs.VectorMaskEnable = false;
1137 /* Wa_1606682166:
1138 * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
1139 * Disable the Sampler state prefetch functionality in the SARB by
1140 * programming 0xB000[30] to '1'.
1141 */
1142 vs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
1143 vs.BindingTableEntryCount = vs_bin->bind_map.surface_count;
1144 vs.FloatingPointMode = IEEE754;
1145 vs.IllegalOpcodeExceptionEnable = false;
1146 vs.SoftwareExceptionEnable = false;
1147 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
1148
1149 if (GFX_VER == 9 && devinfo->gt == 4 &&
1150 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1151 /* On Sky Lake GT4, we have experienced some hangs related to the VS
1152 * cache and tessellation. It is unknown exactly what is happening
1153 * but the Haswell docs for the "VS Reference Count Full Force Miss
1154 * Enable" field of the "Thread Mode" register refer to a HSW bug in
1155 * which the VUE handle reference count would overflow resulting in
1156 * internal reference counting bugs. My (Faith's) best guess is that
1157 * this bug cropped back up on SKL GT4 when we suddenly had more
1158 * threads in play than any previous gfx9 hardware.
1159 *
1160 * What we do know for sure is that setting this bit when
1161 * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
1162 * City when playing with DXVK (https://bugs.freedesktop.org/107280).
1163 * Disabling the vertex cache with tessellation shaders should only
1164 * have a minor performance impact as the tessellation shaders are
1165 * likely generating and processing far more geometry than the vertex
1166 * stage.
1167 */
1168 vs.VertexCacheDisable = true;
1169 }
1170
1171 vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
1172 vs.VertexURBEntryReadOffset = 0;
1173 vs.DispatchGRFStartRegisterForURBData =
1174 vs_prog_data->base.base.dispatch_grf_start_reg;
1175
1176 vs.UserClipDistanceClipTestEnableBitmask =
1177 vs_prog_data->base.clip_distance_mask;
1178 vs.UserClipDistanceCullTestEnableBitmask =
1179 vs_prog_data->base.cull_distance_mask;
1180
1181 #if GFX_VERx10 < 125
1182 vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
1183 vs.ScratchSpaceBasePointer =
1184 get_scratch_address(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
1185 #endif
1186
1187 #if GFX_VER >= 30
1188 vs.RegistersPerThread = ptl_register_blocks(vs_prog_data->base.base.grf_used);
1189 #endif
1190 }
1191
1192 anv_pipeline_emit_merge(pipeline, final.vs, vs_dwords, GENX(3DSTATE_VS), vs) {
1193 #if GFX_VERx10 >= 125
1194 vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1195 MESA_SHADER_VERTEX,
1196 vs_bin, false);
1197 #endif
1198 }
1199 if (pipeline_needs_protected(&pipeline->base.base)) {
1200 anv_pipeline_emit_merge(pipeline, final.vs_protected,
1201 vs_dwords, GENX(3DSTATE_VS), vs) {
1202 #if GFX_VERx10 >= 125
1203 vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1204 MESA_SHADER_VERTEX,
1205 vs_bin, true);
1206 #endif
1207 }
1208 }
1209 }
1210
1211 static void
emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
1213 const struct vk_tessellation_state *ts)
1214 {
1215 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1216 anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
1217 anv_pipeline_emit(pipeline, final.hs_protected, GENX(3DSTATE_HS), hs);
1218 anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
1219 anv_pipeline_emit(pipeline, final.ds_protected, GENX(3DSTATE_DS), ds);
1220 return;
1221 }
1222
1223 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1224 const struct anv_shader_bin *tcs_bin =
1225 pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
1226 const struct anv_shader_bin *tes_bin =
1227 pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
1228
1229 const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1230 const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1231
1232 uint32_t hs_dwords[GENX(3DSTATE_HS_length)];
1233 anv_pipeline_emit_tmp(pipeline, hs_dwords, GENX(3DSTATE_HS), hs) {
1234 hs.Enable = true;
1235 hs.StatisticsEnable = true;
1236 hs.KernelStartPointer = tcs_bin->kernel.offset;
1237 /* Wa_1606682166 */
1238 hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
1239 hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
1240
1241 #if GFX_VER >= 12
1242 /* Wa_1604578095:
1243 *
       * Hang occurs when the maximum number of threads is less than 2 times
       * the instance count; the maximum number of threads must be more than
       * 2 times the instance count.
1247 */
1248 assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
1249 #endif
1250
1251 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1252 hs.IncludeVertexHandles = true;
1253 hs.InstanceCount = tcs_prog_data->instances - 1;
1254
1255 hs.VertexURBEntryReadLength = 0;
1256 hs.VertexURBEntryReadOffset = 0;
1257 hs.DispatchGRFStartRegisterForURBData =
1258 tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
1259 #if GFX_VER >= 12
1260 hs.DispatchGRFStartRegisterForURBData5 =
1261 tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
1262 #endif
1263
1264 #if GFX_VERx10 < 125
1265 hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1266 hs.ScratchSpaceBasePointer =
1267 get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
1268 #endif
1269
1270 #if GFX_VER == 12
1271 /* Patch Count threshold specifies the maximum number of patches that
1272 * will be accumulated before a thread dispatch is forced.
1273 */
1274 hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
1275 #endif
1276
1277 #if GFX_VER < 20
1278 hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
1279 #endif
1280 hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
1281
1282 #if GFX_VER >= 30
1283 hs.RegistersPerThread = ptl_register_blocks(tcs_prog_data->base.base.grf_used);
1284 #endif
1285 };
1286
1287 uint32_t ds_dwords[GENX(3DSTATE_DS_length)];
1288 anv_pipeline_emit_tmp(pipeline, ds_dwords, GENX(3DSTATE_DS), ds) {
1289 ds.Enable = true;
1290 ds.StatisticsEnable = true;
1291 ds.KernelStartPointer = tes_bin->kernel.offset;
1292 /* Wa_1606682166 */
1293 ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
1294 ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
1295 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1296
1297 ds.ComputeWCoordinateEnable =
1298 tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
1299
1300 ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1301 ds.PatchURBEntryReadOffset = 0;
1302 ds.DispatchGRFStartRegisterForURBData =
1303 tes_prog_data->base.base.dispatch_grf_start_reg;
1304
1305 #if GFX_VER < 11
1306 ds.DispatchMode =
1307 tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
1308 DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1309 DISPATCH_MODE_SIMD4X2;
1310 #else
1311 assert(tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
1312 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
1313 #endif
1314
1315 ds.UserClipDistanceClipTestEnableBitmask =
1316 tes_prog_data->base.clip_distance_mask;
1317 ds.UserClipDistanceCullTestEnableBitmask =
1318 tes_prog_data->base.cull_distance_mask;
1319
1320 #if GFX_VER >= 12
1321 ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
1322 #endif
1323 #if GFX_VERx10 < 125
1324 ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1325 ds.ScratchSpaceBasePointer =
1326 get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
1327 #endif
1328
1329 #if GFX_VER >= 30
1330 ds.RegistersPerThread = ptl_register_blocks(tes_prog_data->base.base.grf_used);
1331 #endif
1332 }
1333
1334 anv_pipeline_emit_merge(pipeline, final.hs, hs_dwords, GENX(3DSTATE_HS), hs) {
1335 #if GFX_VERx10 >= 125
1336 hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1337 MESA_SHADER_TESS_CTRL,
1338 tcs_bin, false);
1339 #endif
1340 }
1341 anv_pipeline_emit_merge(pipeline, final.ds, ds_dwords, GENX(3DSTATE_DS), ds) {
1342 #if GFX_VERx10 >= 125
1343 ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1344 MESA_SHADER_TESS_EVAL,
1345 tes_bin, false);
1346 #endif
1347 }
1348 if (pipeline_needs_protected(&pipeline->base.base)) {
1349 anv_pipeline_emit_merge(pipeline, final.hs_protected,
1350 hs_dwords, GENX(3DSTATE_HS), hs) {
1351 #if GFX_VERx10 >= 125
1352 hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1353 MESA_SHADER_TESS_CTRL,
1354 tcs_bin, true);
1355 #endif
1356 }
1357 anv_pipeline_emit_merge(pipeline, final.ds_protected,
1358 ds_dwords, GENX(3DSTATE_DS), ds) {
1359 #if GFX_VERx10 >= 125
1360 ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1361 MESA_SHADER_TESS_EVAL,
1362 tes_bin, true);
1363 #endif
1364 }
1365 }
1366 }
1367
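/* True when any tessellation or geometry stage consumes gl_PrimitiveID; used
 * below to decide whether Wa_14015055625 forces tessellation distribution off.
 */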
1368 static UNUSED bool
geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
1370 {
1371 const struct brw_tcs_prog_data *tcs_prog_data =
1372 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ?
1373 get_tcs_prog_data(pipeline) : NULL;
1374 const struct brw_tes_prog_data *tes_prog_data =
1375 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
1376 get_tes_prog_data(pipeline) : NULL;
1377 const struct brw_gs_prog_data *gs_prog_data =
1378 anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ?
1379 get_gs_prog_data(pipeline) : NULL;
1380
1381 return (tcs_prog_data && tcs_prog_data->include_primitive_id) ||
1382 (tes_prog_data && tes_prog_data->include_primitive_id) ||
1383 (gs_prog_data && gs_prog_data->include_primitive_id);
1384 }
1385
1386 static void
emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
1388 {
1389 anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
1390 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1391 const struct brw_tes_prog_data *tes_prog_data =
1392 get_tes_prog_data(pipeline);
1393
1394 te.Partitioning = tes_prog_data->partitioning;
1395 te.TEDomain = tes_prog_data->domain;
1396 te.TEEnable = true;
1397 te.MaximumTessellationFactorOdd = 63.0;
1398 te.MaximumTessellationFactorNotOdd = 64.0;
1399 #if GFX_VERx10 >= 125
1400 const struct anv_device *device = pipeline->base.base.device;
1401 if (intel_needs_workaround(device->info, 22012699309))
1402 te.TessellationDistributionMode = TEDMODE_RR_STRICT;
1403 else
1404 te.TessellationDistributionMode = TEDMODE_RR_FREE;
1405
1406 if (intel_needs_workaround(device->info, 14015055625)) {
1407 /* Wa_14015055625:
1408 *
1409 * Disable Tessellation Distribution when primitive Id is enabled.
1410 */
1411 if (sbe_primitive_id_override(pipeline) ||
1412 geom_or_tess_prim_id_used(pipeline))
1413 te.TessellationDistributionMode = TEDMODE_OFF;
1414 }
1415
1416 #if GFX_VER >= 20
1417 te.TessellationDistributionLevel = TEDLEVEL_REGION;
1418 #else
1419 te.TessellationDistributionLevel = TEDLEVEL_PATCH;
1420 #endif
1421 /* 64_TRIANGLES */
1422 te.SmallPatchThreshold = 3;
1423 /* 1K_TRIANGLES */
1424 te.TargetBlockSize = 8;
1425 /* 1K_TRIANGLES */
1426 te.LocalBOPAccumulatorThreshold = 1;
1427 #endif
1428
1429 #if GFX_VER >= 20
1430 te.NumberOfRegionsPerPatch = 2;
1431 #endif
1432 }
1433 }
1434 }
1435
1436 static void
emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
1438 {
1439 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1440 anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
1441 anv_pipeline_emit(pipeline, partial.gs_protected, GENX(3DSTATE_GS), gs);
1442 return;
1443 }
1444
1445 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1446 const struct anv_shader_bin *gs_bin =
1447 pipeline->base.shaders[MESA_SHADER_GEOMETRY];
1448 const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
1449
1450 uint32_t gs_dwords[GENX(3DSTATE_GS_length)];
1451 anv_pipeline_emit_tmp(pipeline, gs_dwords, GENX(3DSTATE_GS), gs) {
1452 gs.Enable = true;
1453 gs.StatisticsEnable = true;
1454 gs.KernelStartPointer = gs_bin->kernel.offset;
1455 #if GFX_VER < 20
1456 gs.DispatchMode = gs_prog_data->base.dispatch_mode;
1457 #endif
1458
1459 gs.SingleProgramFlow = false;
1460 gs.VectorMaskEnable = false;
1461 /* Wa_1606682166 */
1462 gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
1463 gs.BindingTableEntryCount = gs_bin->bind_map.surface_count;
1464 gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
1465 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
1466
1467 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
1468
1469 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
1470 gs.OutputTopology = gs_prog_data->output_topology;
1471 gs.ControlDataFormat = gs_prog_data->control_data_format;
1472 gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
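      /* InstanceControl is the GS invocation count minus one, so a shader
       * reporting 0 invocations is still programmed as a single instance.
       */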
1473 gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;
1474
1475 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
1476 gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;
1477 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
1478 gs_prog_data->static_vertex_count : 0;
1479
1480 gs.VertexURBEntryReadOffset = 0;
1481 gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
1482 gs.DispatchGRFStartRegisterForURBData =
1483 gs_prog_data->base.base.dispatch_grf_start_reg;
1484
1485 gs.UserClipDistanceClipTestEnableBitmask =
1486 gs_prog_data->base.clip_distance_mask;
1487 gs.UserClipDistanceCullTestEnableBitmask =
1488 gs_prog_data->base.cull_distance_mask;
1489
1490 #if GFX_VERx10 < 125
1491 gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
1492 gs.ScratchSpaceBasePointer =
1493 get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
1494 #endif
1495
1496 #if GFX_VER >= 30
1497 gs.RegistersPerThread = ptl_register_blocks(gs_prog_data->base.base.grf_used);
1498 #endif
1499 }
1500
1501 anv_pipeline_emit_merge(pipeline, partial.gs, gs_dwords, GENX(3DSTATE_GS), gs) {
1502 #if GFX_VERx10 >= 125
1503 gs.ScratchSpaceBuffer =
1504 get_scratch_surf(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin, false);
1505 #endif
1506 }
1507 if (pipeline_needs_protected(&pipeline->base.base)) {
1508 anv_pipeline_emit_merge(pipeline, partial.gs_protected,
1509 gs_dwords, GENX(3DSTATE_GS), gs) {
1510 #if GFX_VERx10 >= 125
1511 gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base.base,
1512 MESA_SHADER_GEOMETRY,
1513 gs_bin, true);
1514 #endif
1515 }
1516 }
1517 }
1518
1519 static void
1520 emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
1521 const struct vk_input_assembly_state *ia,
1522 const struct vk_rasterization_state *rs,
1523 const struct vk_multisample_state *ms,
1524 const struct vk_color_blend_state *cb,
1525 const struct vk_render_pass_state *rp)
1526 {
1527 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1528
1529 anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
1530 wm.StatisticsEnable = true;
1531 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1532 wm.LineAntialiasingRegionWidth = _10pixels;
1533 wm.PointRasterizationRule = RASTRULE_UPPER_LEFT;
1534
1535 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1536 if (wm_prog_data->early_fragment_tests) {
1537 wm.EarlyDepthStencilControl = EDSC_PREPS;
1538 } else if (wm_prog_data->has_side_effects) {
1539 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
1540 } else {
1541 wm.EarlyDepthStencilControl = EDSC_NORMAL;
1542 }
1543 }
1544 }
1545 }
1546
1547 static void
1548 emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
1549 const struct vk_multisample_state *ms,
1550 const struct vk_color_blend_state *cb)
1551 {
1552 UNUSED const struct intel_device_info *devinfo =
1553 pipeline->base.base.device->info;
1554 const struct anv_shader_bin *fs_bin =
1555 pipeline->base.shaders[MESA_SHADER_FRAGMENT];
1556
1557 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1558 anv_pipeline_emit(pipeline, partial.ps, GENX(3DSTATE_PS), ps);
1559 anv_pipeline_emit(pipeline, partial.ps_protected, GENX(3DSTATE_PS), ps);
1560 return;
1561 }
1562
1563 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1564
1565 uint32_t ps_dwords[GENX(3DSTATE_PS_length)];
1566 anv_pipeline_emit_tmp(pipeline, ps_dwords, GENX(3DSTATE_PS), ps) {
1567 #if GFX_VER == 12
1568 assert(wm_prog_data->dispatch_multi == 0 ||
1569 (wm_prog_data->dispatch_multi == 16 && wm_prog_data->max_polygons == 2));
1570 ps.DualSIMD8DispatchEnable = wm_prog_data->dispatch_multi;
1571 /* XXX - No major improvement observed from enabling
1572 * overlapping subspans, but it could be helpful
1573 * in theory when the requirements listed on the
1574 * BSpec page for 3DSTATE_PS_BODY are met.
1575 */
1576 ps.OverlappingSubspansEnable = false;
1577 #endif
1578
1579 ps.SingleProgramFlow = false;
1580 ps.VectorMaskEnable = wm_prog_data->uses_vmask;
1581 /* Wa_1606682166 */
1582 ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
1583 ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
1584 #if GFX_VER < 20
1585 ps.PushConstantEnable =
1586 devinfo->needs_null_push_constant_tbimr_workaround ||
1587 wm_prog_data->base.nr_params > 0 ||
1588 wm_prog_data->base.ubo_ranges[0].length;
1589 #endif
1590
1591 ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
1592
1593 #if GFX_VERx10 < 125
1594 ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
1595 ps.ScratchSpaceBasePointer =
1596 get_scratch_address(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
1597 #endif
1598
1599 #if GFX_VER >= 30
1600 ps.RegistersPerThread = ptl_register_blocks(wm_prog_data->base.grf_used);
1601 #endif
1602 }
1603 anv_pipeline_emit_merge(pipeline, partial.ps, ps_dwords, GENX(3DSTATE_PS), ps) {
1604 #if GFX_VERx10 >= 125
1605 ps.ScratchSpaceBuffer =
1606 get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin, false);
1607 #endif
1608 }
1609 if (pipeline_needs_protected(&pipeline->base.base)) {
1610 anv_pipeline_emit_merge(pipeline, partial.ps_protected,
1611 ps_dwords, GENX(3DSTATE_PS), ps) {
1612 #if GFX_VERx10 >= 125
1613 ps.ScratchSpaceBuffer =
1614 get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin, true);
1615 #endif
1616 }
1617 }
1618 }
1619
1620 static void
1621 emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
1622 const struct vk_rasterization_state *rs,
1623 const struct vk_graphics_pipeline_state *state)
1624 {
1625 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1626
1627 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1628 anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
1629 return;
1630 }
1631
1632 anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
1633 ps.PixelShaderValid = true;
1634 #if GFX_VER < 20
1635 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
1636 #endif
1637 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1638 ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
1639 ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
1640 ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1641
1642 ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
1643 #if GFX_VER >= 20
1644 assert(!wm_prog_data->pulls_bary);
1645 #else
1646 ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;
1647 #endif
1648
1649 ps.InputCoverageMaskState = ICMS_NONE;
1650 assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
1651 if (!wm_prog_data->uses_sample_mask)
1652 ps.InputCoverageMaskState = ICMS_NONE;
1653 else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
1654 ps.InputCoverageMaskState = ICMS_NORMAL;
1655 else if (wm_prog_data->post_depth_coverage)
1656 ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
1657 else
1658 ps.InputCoverageMaskState = ICMS_NORMAL;
1659
1660 #if GFX_VER >= 11
1661 ps.PixelShaderRequiresSubpixelSampleOffsets =
1662 wm_prog_data->uses_sample_offsets;
1663 ps.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
1664 wm_prog_data->uses_npc_bary_coefficients;
1665 ps.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
1666 wm_prog_data->uses_pc_bary_coefficients;
1667 ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
1668 wm_prog_data->uses_depth_w_coefficients;
1669 #endif
1670 }
1671 }
1672
1673 static void
1674 compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
1675 const struct vk_multisample_state *ms,
1676 const struct vk_graphics_pipeline_state *state)
1677 {
1678 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1679 pipeline->kill_pixel = false;
1680 return;
1681 }
1682
1683 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1684
1685    /* This computes the KillPixel portion of the check that decides whether
1686     * to enable the PMA fix on gfx8 or gfx9. It's given by this chunk of the
1687     * giant formula:
1688 *
1689 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1690 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1691 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1692 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1693 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1694 *
1695 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
1696 * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
1697 * of an alpha test.
1698 */
1699 pipeline->rp_has_ds_self_dep =
1700 (state->pipeline_flags &
1701 VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) != 0;
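   /* In the assignment below, uses_kill and uses_omask correspond to the
    * PixelShaderKillsPixels and "oMask Present to RenderTarget" terms of the
    * formula above, and ms->alpha_to_coverage_enable to AlphaToCoverageEnable.
    */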
1702 pipeline->kill_pixel =
1703 pipeline->rp_has_ds_self_dep ||
1704 wm_prog_data->uses_kill ||
1705 wm_prog_data->uses_omask ||
1706 (ms && ms->alpha_to_coverage_enable);
1707 }
1708
1709 #if GFX_VER >= 12
1710 static void
1711 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
1712 const struct vk_render_pass_state *rp)
1713 {
1714 if (anv_pipeline_is_mesh(pipeline)) {
1715 anv_pipeline_emit(pipeline, final.primitive_replication,
1716 GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1717 return;
1718 }
1719
1720 const int replication_count =
1721 anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;
1722
1723 assert(replication_count >= 1);
1724 if (replication_count == 1) {
1725 anv_pipeline_emit(pipeline, final.primitive_replication,
1726 GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1727 return;
1728 }
1729
1730 assert(replication_count == util_bitcount(rp->view_mask));
1731 assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
1732
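   /* Worked example: with view_mask = 0x5 (views 0 and 2), replication_count
    * is 2, giving ReplicaMask = 0x3, ReplicationCount = 1 and
    * RTAIOffset[] = { 0, 2 }.
    */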
1733 anv_pipeline_emit(pipeline, final.primitive_replication,
1734 GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
1735 pr.ReplicaMask = (1 << replication_count) - 1;
1736 pr.ReplicationCount = replication_count - 1;
1737
1738 int i = 0;
1739 u_foreach_bit(view_index, rp->view_mask) {
1740 pr.RTAIOffset[i] = view_index;
1741 i++;
1742 }
1743 }
1744 }
1745 #endif
1746
1747 #if GFX_VERx10 >= 125
1748 static void
1749 emit_task_state(struct anv_graphics_pipeline *pipeline)
1750 {
1751 assert(anv_pipeline_is_mesh(pipeline));
1752
1753 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
1754 anv_pipeline_emit(pipeline, final.task_control,
1755 GENX(3DSTATE_TASK_CONTROL), zero);
1756 anv_pipeline_emit(pipeline, final.task_control_protected,
1757 GENX(3DSTATE_TASK_CONTROL), zero);
1758 anv_pipeline_emit(pipeline, final.task_shader,
1759 GENX(3DSTATE_TASK_SHADER), zero);
1760 anv_pipeline_emit(pipeline, final.task_redistrib,
1761 GENX(3DSTATE_TASK_REDISTRIB), zero);
1762 return;
1763 }
1764
1765 const struct anv_shader_bin *task_bin =
1766 pipeline->base.shaders[MESA_SHADER_TASK];
1767
1768 uint32_t task_control_dwords[GENX(3DSTATE_TASK_CONTROL_length)];
1769 anv_pipeline_emit_tmp(pipeline, task_control_dwords, GENX(3DSTATE_TASK_CONTROL), tc) {
1770 tc.TaskShaderEnable = true;
1771 tc.StatisticsEnable = true;
1772 tc.MaximumNumberofThreadGroups = 511;
1773 }
1774
1775 anv_pipeline_emit_merge(pipeline, final.task_control,
1776 task_control_dwords, GENX(3DSTATE_TASK_CONTROL), tc) {
1777 tc.ScratchSpaceBuffer =
1778 get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin, false);
1779 }
1780 if (pipeline_needs_protected(&pipeline->base.base)) {
1781 anv_pipeline_emit_merge(pipeline, final.task_control_protected,
1782 task_control_dwords, GENX(3DSTATE_TASK_CONTROL), tc) {
1783 tc.ScratchSpaceBuffer =
1784 get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin, true);
1785 }
1786 }
1787
1788 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1789 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1790 const struct intel_cs_dispatch_info task_dispatch =
1791 brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
1792
1793 anv_pipeline_emit(pipeline, final.task_shader,
1794 GENX(3DSTATE_TASK_SHADER), task) {
1795 task.KernelStartPointer = task_bin->kernel.offset;
1796 task.SIMDSize = task_dispatch.simd_size / 16;
1797 task.MessageSIMD = task.SIMDSize;
1798 task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads;
1799 task.ExecutionMask = task_dispatch.right_mask;
1800 task.LocalXMaximum = task_dispatch.group_size - 1;
1801 task.EmitLocalIDX = true;
1802
1803 task.NumberofBarriers = task_prog_data->base.uses_barrier;
1804 task.SharedLocalMemorySize =
1805 intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared);
1806 task.PreferredSLMAllocationSize =
1807 intel_compute_preferred_slm_calc_encode_size(devinfo,
1808 task_prog_data->base.base.total_shared,
1809 task_dispatch.group_size,
1810 task_dispatch.simd_size);
1811
1812       /*
1813        * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for the address
1814        * of a buffer containing the push constants and descriptor set table,
1815        * and InlineData[2:7] will be used for the first few push constants.
1816        */
1817 task.EmitInlineParameter = true;
1818
1819 task.XP0Required = task_prog_data->uses_drawid;
1820
1821 #if GFX_VER >= 30
1822 task.RegistersPerThread = ptl_register_blocks(task_prog_data->base.base.grf_used);
1823 #endif
1824 }
1825
1826 /* Recommended values from "Task and Mesh Distribution Programming". */
1827 anv_pipeline_emit(pipeline, final.task_redistrib,
1828 GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
1829 redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
1830 redistrib.SmallTaskThreshold = 1; /* 2^N */
1831 redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
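      /* Reading the 2^N notes above as log2-encoded values, this asks for a
       * small-task threshold of 2 tasks and a target mesh batch of 8 or 32
       * thread groups depending on the slice count.
       */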
1832 redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM;
1833 redistrib.TaskRedistributionMode = TASKREDISTRIB_RR_STRICT;
1834 }
1835 }
1836
1837 static void
1838 emit_mesh_state(struct anv_graphics_pipeline *pipeline)
1839 {
1840 assert(anv_pipeline_is_mesh(pipeline));
1841
1842 const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
1843 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1844
1845 uint32_t mesh_control_dwords[GENX(3DSTATE_MESH_CONTROL_length)];
1846 anv_pipeline_emit_tmp(pipeline, mesh_control_dwords, GENX(3DSTATE_MESH_CONTROL), mc) {
1847 mc.MeshShaderEnable = true;
1848 mc.StatisticsEnable = true;
1849 mc.MaximumNumberofThreadGroups = 511;
1850 #if GFX_VER >= 20
1851 mc.VPandRTAIndexAutostripEnable = mesh_prog_data->autostrip_enable;
1852 #endif
1853 }
1854
1855 anv_pipeline_emit_merge(pipeline, final.mesh_control,
1856 mesh_control_dwords, GENX(3DSTATE_MESH_CONTROL), mc) {
1857 mc.ScratchSpaceBuffer =
1858 get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin, false);
1859 }
1860 if (pipeline_needs_protected(&pipeline->base.base)) {
1861 anv_pipeline_emit_merge(pipeline, final.mesh_control_protected,
1862 mesh_control_dwords, GENX(3DSTATE_MESH_CONTROL), mc) {
1863 mc.ScratchSpaceBuffer =
1864 get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin, true);
1865 }
1866 }
1867
1868 const struct intel_device_info *devinfo = pipeline->base.base.device->info;
1869 const struct intel_cs_dispatch_info mesh_dispatch =
1870 brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL);
1871
1872 const unsigned output_topology =
1873 mesh_prog_data->primitive_type == MESA_PRIM_POINTS ? OUTPUT_POINT :
1874 mesh_prog_data->primitive_type == MESA_PRIM_LINES ? OUTPUT_LINE :
1875 OUTPUT_TRI;
1876
1877 uint32_t index_format;
1878 switch (mesh_prog_data->index_format) {
1879 case BRW_INDEX_FORMAT_U32:
1880 index_format = INDEX_U32;
1881 break;
1882 case BRW_INDEX_FORMAT_U888X:
1883 index_format = INDEX_U888X;
1884 break;
1885 default:
1886 unreachable("invalid index format");
1887 }
1888
1889 anv_pipeline_emit(pipeline, final.mesh_shader,
1890 GENX(3DSTATE_MESH_SHADER), mesh) {
1891 mesh.KernelStartPointer = mesh_bin->kernel.offset;
1892 mesh.SIMDSize = mesh_dispatch.simd_size / 16;
1893 mesh.MessageSIMD = mesh.SIMDSize;
1894 mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads;
1895 mesh.ExecutionMask = mesh_dispatch.right_mask;
1896 mesh.LocalXMaximum = mesh_dispatch.group_size - 1;
1897 mesh.EmitLocalIDX = true;
1898
1899 mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
1900 mesh.OutputTopology = output_topology;
1901 mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8;
1902 mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0;
1903 mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8;
1904 mesh.IndexFormat = index_format;
1905
1906 mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
1907 mesh.SharedLocalMemorySize =
1908 intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared);
1909 mesh.PreferredSLMAllocationSize =
1910 intel_compute_preferred_slm_calc_encode_size(devinfo,
1911 mesh_prog_data->base.base.total_shared,
1912 mesh_dispatch.group_size,
1913 mesh_dispatch.simd_size);
1914
1915       /*
1916        * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for the address
1917        * of a buffer containing the push constants and descriptor set table,
1918        * and InlineData[2:7] will be used for the first few push constants.
1919        */
1920 mesh.EmitInlineParameter = true;
1921
1922 mesh.XP0Required = mesh_prog_data->uses_drawid;
1923
1924 #if GFX_VER >= 30
1925 mesh.RegistersPerThread = ptl_register_blocks(mesh_prog_data->base.base.grf_used);
1926 #endif
1927 }
1928
1929 /* Recommended values from "Task and Mesh Distribution Programming". */
1930 anv_pipeline_emit(pipeline, final.mesh_distrib,
1931 GENX(3DSTATE_MESH_DISTRIB), distrib) {
1932 distrib.DistributionMode = MESH_RR_FREE;
1933 distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
1934 distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
1935 }
1936 }
1937 #endif
1938
1939 void
1940 genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
1941 const struct vk_graphics_pipeline_state *state)
1942 {
1943 enum intel_urb_deref_block_size urb_deref_block_size;
1944 emit_urb_setup(pipeline, &urb_deref_block_size);
1945
1946 emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp,
1947 urb_deref_block_size);
1948 compute_kill_pixel(pipeline, state->ms, state);
1949
1950 emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);
1951
1952 #if GFX_VER >= 12
1953 emit_3dstate_primitive_replication(pipeline, state->rp);
1954 #endif
1955
1956 #if GFX_VERx10 >= 125
1957 bool needs_instance_granularity =
1958 intel_needs_workaround(pipeline->base.base.device->info, 14019166699) &&
1959 (sbe_primitive_id_override(pipeline) ||
1960 geom_or_tess_prim_id_used(pipeline));
1961
1962 anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
1963 /* Gfx12.5: If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
1964 vfg.DistributionMode =
1965 #if GFX_VER < 20
1966 !anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_FREE :
1967 #endif
1968 RR_STRICT;
1969 vfg.DistributionGranularity = needs_instance_granularity ?
1970 InstanceLevelGranularity : BatchLevelGranularity;
1971 #if INTEL_WA_14014851047_GFX_VER
1972 vfg.GranularityThresholdDisable =
1973 intel_needs_workaround(pipeline->base.base.device->info, 14014851047);
1974 #endif
1975 /* 192 vertices for TRILIST_ADJ */
1976 vfg.ListNBatchSizeScale = 0;
1977 /* Batch size of 384 vertices */
1978 vfg.List3BatchSizeScale = 2;
1979 /* Batch size of 128 vertices */
1980 vfg.List2BatchSizeScale = 1;
1981 /* Batch size of 128 vertices */
1982 vfg.List1BatchSizeScale = 2;
1983 /* Batch size of 256 vertices for STRIP topologies */
1984 vfg.StripBatchSizeScale = 3;
1985 /* 192 control points for PATCHLIST_3 */
1986 vfg.PatchBatchSizeScale = 1;
1987 /* 192 control points for PATCHLIST_3 */
1988 vfg.PatchBatchSizeMultiplier = 31;
1989 }
1990 #endif
1991
1992 if (anv_pipeline_is_primitive(pipeline)) {
1993 emit_vertex_input(pipeline, state, state->vi);
1994
1995 emit_3dstate_vs(pipeline);
1996 emit_3dstate_hs_ds(pipeline, state->ts);
1997 emit_3dstate_te(pipeline);
1998 emit_3dstate_gs(pipeline);
1999
2000 emit_3dstate_streamout(pipeline, state->rs);
2001
2002 #if GFX_VERx10 >= 125
2003 const struct anv_device *device = pipeline->base.base.device;
2004 /* Disable Mesh. */
2005 if (device->vk.enabled_extensions.EXT_mesh_shader) {
2006 anv_pipeline_emit(pipeline, final.mesh_control,
2007 GENX(3DSTATE_MESH_CONTROL), zero);
2008 anv_pipeline_emit(pipeline, final.mesh_control_protected,
2009 GENX(3DSTATE_MESH_CONTROL), zero);
2010 anv_pipeline_emit(pipeline, final.mesh_shader,
2011 GENX(3DSTATE_MESH_SHADER), zero);
2012 anv_pipeline_emit(pipeline, final.mesh_distrib,
2013 GENX(3DSTATE_MESH_DISTRIB), zero);
2014 anv_pipeline_emit(pipeline, final.clip_mesh,
2015 GENX(3DSTATE_CLIP_MESH), zero);
2016 anv_pipeline_emit(pipeline, final.sbe_mesh,
2017 GENX(3DSTATE_SBE_MESH), zero);
2018 anv_pipeline_emit(pipeline, final.task_control,
2019 GENX(3DSTATE_TASK_CONTROL), zero);
2020 anv_pipeline_emit(pipeline, final.task_control_protected,
2021 GENX(3DSTATE_TASK_CONTROL), zero);
2022 anv_pipeline_emit(pipeline, final.task_shader,
2023 GENX(3DSTATE_TASK_SHADER), zero);
2024 anv_pipeline_emit(pipeline, final.task_redistrib,
2025 GENX(3DSTATE_TASK_REDISTRIB), zero);
2026 }
2027 #endif
2028 } else {
2029 assert(anv_pipeline_is_mesh(pipeline));
2030
2031 anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs);
2032 #if GFX_VER >= 11
2033 anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs);
2034 #endif
2035 anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs);
2036 anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
2037 anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
2038 anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te);
2039 anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
2040
2041 anv_pipeline_emit(pipeline, final.vs_protected, GENX(3DSTATE_VS), vs);
2042 anv_pipeline_emit(pipeline, final.hs_protected, GENX(3DSTATE_HS), hs);
2043 anv_pipeline_emit(pipeline, final.ds_protected, GENX(3DSTATE_DS), ds);
2044 anv_pipeline_emit(pipeline, partial.gs_protected, GENX(3DSTATE_GS), gs);
2045
2046 /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
2047 * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
2048 */
2049 anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
2050
2051 #if GFX_VERx10 >= 125
2052 emit_task_state(pipeline);
2053 emit_mesh_state(pipeline);
2054 #endif
2055 }
2056
2057 emit_3dstate_sbe(pipeline);
2058 emit_3dstate_wm(pipeline, state->ia, state->rs,
2059 state->ms, state->cb, state->rp);
2060 emit_3dstate_ps(pipeline, state->ms, state->cb);
2061 emit_3dstate_ps_extra(pipeline, state->rs, state);
2062 }
2063
2064 #if GFX_VERx10 >= 125
2065
2066 void
2067 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
2068 {
2069 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2070 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2071 }
2072
2073 #else /* #if GFX_VERx10 >= 125 */
2074
2075 void
2076 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
2077 {
2078 struct anv_device *device = pipeline->base.device;
2079 const struct intel_device_info *devinfo = device->info;
2080 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2081
2082 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2083
2084 const struct intel_cs_dispatch_info dispatch =
2085 brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
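   /* CURBE space for push constants: one copy of the per-thread constants
    * for each HW thread, plus the shared cross-thread constants, rounded up
    * to an even number of registers.
    */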
2086 const uint32_t vfe_curbe_allocation =
2087 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
2088 cs_prog_data->push.cross_thread.regs, 2);
2089
2090 const struct anv_shader_bin *cs_bin = pipeline->cs;
2091
2092 anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
2093 vfe.StackSize = 0;
2094 vfe.MaximumNumberofThreads =
2095 devinfo->max_cs_threads * devinfo->subslice_total - 1;
2096 vfe.NumberofURBEntries = 2;
2097 #if GFX_VER < 11
2098 vfe.ResetGatewayTimer = true;
2099 #endif
2100 vfe.URBEntryAllocationSize = 2;
2101 vfe.CURBEAllocationSize = vfe_curbe_allocation;
2102
2103 if (cs_prog_data->base.total_scratch) {
2104 /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
2105 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
2106 */
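            /* For example, total_scratch = 4096 gives
             * ffs(4096) - 11 = 13 - 11 = 2, the 4k encoding listed above.
             */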
2107 vfe.PerThreadScratchSpace = ffs(cs_prog_data->base.total_scratch) - 11;
2108 vfe.ScratchSpaceBasePointer =
2109 get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2110 }
2111 }
2112
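   /* The interface descriptor is only packed here; it is expected to be
    * copied into dynamic state and referenced by
    * MEDIA_INTERFACE_DESCRIPTOR_LOAD when the command buffer records a
    * dispatch.
    */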
2113 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
2114 .KernelStartPointer =
2115 cs_bin->kernel.offset +
2116 brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
2117
2118 /* Wa_1606682166 */
2119 .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
2120 /* We add 1 because the CS indirect parameters buffer isn't accounted
2121 * for in bind_map.surface_count.
2122 *
2123 * Typically set to 0 to avoid prefetching on every thread dispatch.
2124 */
2125 .BindingTableEntryCount = devinfo->verx10 == 125 ?
2126          0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
2127 .BarrierEnable = cs_prog_data->uses_barrier,
2128 .SharedLocalMemorySize =
2129 intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),
2130
2131 .ConstantURBEntryReadOffset = 0,
2132 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
2133 .CrossThreadConstantDataReadLength =
2134 cs_prog_data->push.cross_thread.regs,
2135 #if GFX_VER >= 12
2136 /* TODO: Check if we are missing workarounds and enable mid-thread
2137 * preemption.
2138 *
2139 * We still have issues with mid-thread preemption (it was already
2140 * disabled by the kernel on gfx11, due to missing workarounds). It's
2141 * possible that we are just missing some workarounds, and could enable
2142        * it later, but for now let's disable it to fix a GPU hang in compute in Car
2143 * Chase (and possibly more).
2144 */
2145 .ThreadPreemptionDisable = true,
2146 #endif
2147
2148 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
2149 };
2150 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
2151 pipeline->interface_descriptor_data,
2152 &desc);
2153 }
2154
2155 #endif /* #if GFX_VERx10 >= 125 */
2156
2157 #if GFX_VERx10 >= 125
2158
2159 void
2160 genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
2161 {
2162 for (uint32_t i = 0; i < pipeline->group_count; i++) {
2163 struct anv_rt_shader_group *group = &pipeline->groups[i];
2164
2165 switch (group->type) {
2166 case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
2167 struct GENX(RT_GENERAL_SBT_HANDLE) sh = {};
2168 sh.General = anv_shader_bin_get_bsr(group->general, 32);
2169 GENX(RT_GENERAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2170 break;
2171 }
2172
2173 case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
2174 struct GENX(RT_TRIANGLES_SBT_HANDLE) sh = {};
2175 if (group->closest_hit)
2176 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2177 if (group->any_hit)
2178 sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
2179 GENX(RT_TRIANGLES_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2180 break;
2181 }
2182
2183 case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
2184 struct GENX(RT_PROCEDURAL_SBT_HANDLE) sh = {};
2185 if (group->closest_hit)
2186 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2187 sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
2188 GENX(RT_PROCEDURAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
2189 break;
2190 }
2191
2192 default:
2193 unreachable("Invalid shader group type");
2194 }
2195 }
2196 }
2197
2198 #else
2199
2200 void
2201 genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
2202 {
2203 unreachable("Ray tracing not supported");
2204 }
2205
2206 #endif /* GFX_VERx10 >= 125 */
2207