/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/gen_rt_pack.h"

#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"
#include "vk_log.h"

static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From the Broadwell spec, command reference structures, page 586:
    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *   64-bit components are stored in the URB without any conversion. In
    *   this case, vertex elements must be written as 128 or 256 bits, with
    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
    *   256-bit vertex element."
    */
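   /* For example, with the logic below R64_PASSTHRU resolves to
    * {STORE_SRC, STORE_0, NOSTORE, NOSTORE} (a 128-bit element) and
    * R64G64B64_PASSTHRU to {STORE_SRC, STORE_SRC, STORE_SRC, STORE_0}
    * (a 256-bit element), matching the cases quoted above.
    */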
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware restrictions
       * (see comment above) */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
              isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const VkPipelineVertexInputStateCreateInfo *info)
{
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   /* Pull inputs_read out of the VS prog data */
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read & inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
   const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
                                vs_prog_data->uses_instanceid ||
                                vs_prog_data->uses_firstvertex ||
                                vs_prog_data->uses_baseinstance;

   uint32_t elem_count = __builtin_popcount(elements) -
      __builtin_popcount(elements_double) / 2;

   const uint32_t total_elems =
      MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);

   uint32_t *p;

   const uint32_t num_dwords = 1 + total_elems * 2;
   p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
                       GENX(3DSTATE_VERTEX_ELEMENTS));
   if (!p)
      return;

   for (uint32_t i = 0; i < total_elems; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zeros.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
   }

   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *desc =
         &info->pVertexAttributeDescriptions[i];
      enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
                                                  desc->format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);

      assert(desc->binding < MAX_VBS);

      if ((elements & (1 << desc->location)) == 0)
         continue; /* Binding unused */

      uint32_t slot =
         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                         ((1 << desc->location) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = desc->binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = desc->offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);

#if GFX_VER >= 8
      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
         vfi.VertexElementIndex = slot;
         vfi.InstanceDataStepRate =
            pipeline->vb[desc->binding].instance_divisor;
      }
#endif
   }

   const uint32_t id_slot = elem_count;
   if (needs_svgs_elem) {
      /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
       *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
       *    Control field is set to something other than VFCOMP_STORE_SRC,
       *    no higher-numbered Component Control fields may be set to
       *    VFCOMP_STORE_SRC"
       *
       * This means that if we have BaseInstance, we need BaseVertex as
       * well.  Just do all or nothing.
       */
      uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                            vs_prog_data->uses_baseinstance) ?
                           VFCOMP_STORE_SRC : VFCOMP_STORE_0;

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_SVGS_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
         .Component0Control = base_ctrl,
         .Component1Control = base_ctrl,
#if GFX_VER >= 8
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#else
         .Component2Control = VFCOMP_STORE_VID,
         .Component3Control = VFCOMP_STORE_IID,
#endif
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = id_slot;
      }
#endif
   }

#if GFX_VER >= 8
   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber = 2;
      sgvs.VertexIDElementOffset = id_slot;
      sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber = 3;
      sgvs.InstanceIDElementOffset = id_slot;
   }
#endif

   const uint32_t drawid_slot = elem_count + needs_svgs_elem;
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &p[1 + drawid_slot * 2],
                                      &element);

#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = drawid_slot;
      }
#endif
   }
}

void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct intel_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const unsigned entry_size[4],
                     enum intel_urb_deref_block_size *deref_block_size)
{
   const struct intel_device_info *devinfo = &device->info;

   unsigned entries[4];
   unsigned start[4];
   bool constrained;
   intel_get_urb_config(devinfo, l3_config,
                        active_stages &
                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                        entry_size, entries, start, deref_block_size,
                        &constrained);

#if GFX_VERx10 == 70
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
    *    needs to be sent before any combination of VS associated 3DSTATE."
    */
   anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
      pc.DepthStallEnable = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = device->workaround_address;
   }
#endif

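   /* 3DSTATE_URB_VS, _HS, _DS and _GS use consecutive command sub-opcodes,
    * so one loop over the four URB stages can patch _3DCommandSubOpcode
    * instead of emitting four distinct packet types.
    */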
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = start[i];
         urb.VSURBEntryAllocationSize = entry_size[i] - 1;
         urb.VSNumberofURBEntries = entries[i];
      }
   }
}

static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum intel_urb_deref_block_size *deref_block_size)
{
   unsigned entry_size[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;

      entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }

   genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
                        pipeline->base.l3_config,
                        pipeline->active_stages, entry_size,
                        deref_block_size);
}

static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
#if GFX_VER >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
#endif
      return;
   }

   struct GENX(3DSTATE_SBE) sbe = {
      GENX(3DSTATE_SBE_header),
      .AttributeSwizzleEnable = true,
      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
   };

#if GFX_VER >= 9
   for (unsigned i = 0; i < 32; i++)
      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif

#if GFX_VER >= 8
   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
      GENX(3DSTATE_SBE_SWIZ_header),
   };
#else
# define swiz sbe
#endif
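   /* On gfx7 there is no separate SBE_SWIZ packet; the attribute swizzle
    * fields live directly in 3DSTATE_SBE, so "swiz" above simply aliases the
    * sbe struct and everything gets packed into a single packet below.
    */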

   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_map *fs_input_map =
         &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

      int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                           fs_input_map);
      assert(first_slot % 2 == 0);
      unsigned urb_entry_read_offset = first_slot / 2;
      int max_source_attr = 0;
      for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
         uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
         int input_index = wm_prog_data->urb_setup[attr];

         assert(0 <= input_index);

         /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
          * VUE header
          */
         if (attr == VARYING_SLOT_VIEWPORT ||
             attr == VARYING_SLOT_LAYER ||
             attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
            continue;
         }

         if (attr == VARYING_SLOT_PNTC) {
            sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
            continue;
         }

         const int slot = fs_input_map->varying_to_slot[attr];

         if (slot == -1) {
            /* This attribute does not exist in the VUE--that means that the
             * vertex shader did not write to it.  It could be that it's a
             * regular varying read by the fragment shader but not written by
             * the vertex shader, or it's gl_PrimitiveID.  In the first case
             * the value is undefined, in the second it needs to be
             * gl_PrimitiveID.
             */
            swiz.Attribute[input_index].ConstantSource = PRIM_ID;
            swiz.Attribute[input_index].ComponentOverrideX = true;
            swiz.Attribute[input_index].ComponentOverrideY = true;
            swiz.Attribute[input_index].ComponentOverrideZ = true;
            swiz.Attribute[input_index].ComponentOverrideW = true;
            continue;
         }

         /* We have to subtract two slots to account for the URB entry output
          * read offset in the VS and GS stages.
          */
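         /* (The URB entry read offset is expressed in 256-bit units, i.e.
          * pairs of attribute slots, which is also why first_slot is halved
          * above.)
          */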
         const int source_attr = slot - 2 * urb_entry_read_offset;
         assert(source_attr >= 0 && source_attr < 32);
         max_source_attr = MAX2(max_source_attr, source_attr);
         /* The hardware can only apply overrides to the first 16 attributes;
          * the remaining ones have to be lined up so that the input index
          * equals the output index.  We'll need to do some tweaking to make
          * sure that's the case.
          */
         if (input_index < 16)
            swiz.Attribute[input_index].SourceAttribute = source_attr;
         else
            assert(source_attr == input_index);
      }

      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadOffset = true;
      sbe.ForceVertexURBEntryReadLength = true;
#endif
   }

   uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
                                        GENX(3DSTATE_SBE_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);

#if GFX_VER >= 8
   dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
#endif
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
VkPolygonMode
genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
                          VkPrimitiveTopology primitive_topology)
{
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return pipeline->polygon_mode;
      }
      unreachable("Unsupported TCS output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return pipeline->polygon_mode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}

uint32_t
genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
                            VkPolygonMode raster_mode)
{
#if GFX_VER <= 7
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      switch (pipeline->line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         return MSRASTMODE_ON_PATTERN;

      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
         return MSRASTMODE_OFF_PIXEL;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      return pipeline->rasterization_samples > 1 ?
             MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
   }
#else
   unreachable("Only on gen7");
#endif
}

static VkProvokingVertexModeEXT
vk_provoking_vertex_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);

   return rs_pv_info == NULL ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT :
                               rs_pv_info->provokingVertexMode;
}

const uint32_t genX(vk_to_intel_cullmode)[] = {
   [VK_CULL_MODE_NONE] = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT] = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH
};

const uint32_t genX(vk_to_intel_fillmode)[] = {
   [VK_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT,
};

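/* The hardware FrontWinding field is the reverse of the VkFrontFace enum
 * values (1 selects counter-clockwise), hence the explicit mapping below.
 */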
const uint32_t genX(vk_to_intel_front_face)[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1,
   [VK_FRONT_FACE_CLOCKWISE] = 0
};

#if GFX_VER >= 9
static VkConservativeRasterizationModeEXT
vk_conservative_rasterization_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const VkPipelineRasterizationConservativeStateCreateInfoEXT *cr =
      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);

   return cr ? cr->conservativeRasterizationMode :
               VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
}
#endif

void
genX(rasterization_mode)(VkPolygonMode raster_mode,
                         VkLineRasterizationModeEXT line_mode,
                         float line_width,
                         uint32_t *api_mode,
                         bool *msaa_rasterization_enable)
{
#if GFX_VER >= 8
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX100;
#if GFX_VER <= 9
         /* Prior to ICL, the algorithm the HW uses to draw wide lines
          * doesn't quite match what the CTS expects, at least for rectangular
          * lines, so we set this to false here, making it draw parallelograms
          * instead, which work well enough.
          */
         *msaa_rasterization_enable = line_width < 1.0078125;
#else
         *msaa_rasterization_enable = true;
#endif
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX100;
      *msaa_rasterization_enable = true;
   }
#else
   unreachable("Invalid call");
#endif
}

static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineInputAssemblyStateCreateInfo *ia_info,
              const VkPipelineRasterizationStateCreateInfo *rs_info,
              const VkPipelineMultisampleStateCreateInfo *ms_info,
              const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
              const uint32_t dynamic_states,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass,
              enum intel_urb_deref_block_size urb_deref_block_size)
{
   struct GENX(3DSTATE_SF) sf = {
      GENX(3DSTATE_SF_header),
   };

   sf.ViewportTransformEnable = true;
   sf.StatisticsEnable = true;
   sf.VertexSubPixelPrecisionSelect = _8Bit;
   sf.AALineDistanceMode = true;

   switch (vk_provoking_vertex_mode(rs_info)) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 0;
      sf.LineStripListProvokingVertexSelect = 0;
      sf.TriangleFanProvokingVertexSelect = 1;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      sf.TriangleStripListProvokingVertexSelect = 2;
      sf.LineStripListProvokingVertexSelect = 1;
      sf.TriangleFanProvokingVertexSelect = 2;
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }

#if GFX_VERx10 == 75
   sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
#endif

#if GFX_VER >= 12
   sf.DerefBlockSize = urb_deref_block_size;
#endif

   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_prog_data *last_vue_prog_data =
         anv_pipeline_get_last_vue_prog_data(pipeline);

      if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         sf.PointWidthSource = Vertex;
      } else {
         sf.PointWidthSource = State;
         sf.PointWidth = 1.0;
      }
   }

#if GFX_VER >= 8
   struct GENX(3DSTATE_RASTER) raster = {
      GENX(3DSTATE_RASTER_header),
   };
#else
# define raster sf
#endif
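   /* On gfx7 there is no separate 3DSTATE_RASTER packet; the equivalent
    * fields live in 3DSTATE_SF, so "raster" below simply aliases sf.
    */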

   VkPolygonMode raster_mode =
      genX(raster_polygon_mode)(pipeline, ia_info->topology);
   bool dynamic_primitive_topology =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;

   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
    * "Multisample Modes State".
    */
#if GFX_VER >= 8
   if (!dynamic_primitive_topology)
      genX(rasterization_mode)(raster_mode, pipeline->line_mode,
                               rs_info->lineWidth,
                               &raster.APIMode,
                               &raster.DXMultisampleRasterizationEnable);

   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
    * computations.  If we ever set this bit to a different value, they will
    * need to be updated accordingly.
    */
   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
   raster.ForceMultisampling = false;
#else
   uint32_t ms_rast_mode = 0;

   if (!dynamic_primitive_topology)
      ms_rast_mode = genX(ms_rasterization_mode)(pipeline, raster_mode);

   raster.MultisampleRasterizationMode = ms_rast_mode;
#endif

   raster.AntialiasingEnable =
      dynamic_primitive_topology ? 0 :
      anv_rasterization_aa_mode(raster_mode, pipeline->line_mode);

   raster.FrontWinding =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
      0 : genX(vk_to_intel_front_face)[rs_info->frontFace];
   raster.CullMode =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
      0 : genX(vk_to_intel_cullmode)[rs_info->cullMode];

   raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
   raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
   raster.ScissorRectangleEnable = true;

#if GFX_VER >= 9
   /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
#elif GFX_VER >= 8
   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#endif

#if GFX_VER >= 9
   raster.ConservativeRasterizationEnable =
      vk_conservative_rasterization_mode(rs_info) !=
      VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
#endif

   bool depth_bias_enable =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE ?
      0 : rs_info->depthBiasEnable;

   raster.GlobalDepthOffsetEnableSolid = depth_bias_enable;
   raster.GlobalDepthOffsetEnableWireframe = depth_bias_enable;
   raster.GlobalDepthOffsetEnablePoint = depth_bias_enable;

#if GFX_VER == 7
   /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
    * can get the depth offsets correct.
    */
   if (subpass->depth_stencil_attachment) {
      VkFormat vk_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      assert(vk_format_is_depth_or_stencil(vk_format));
      if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
         enum isl_format isl_format =
            anv_get_isl_format(&pipeline->base.device->info, vk_format,
                               VK_IMAGE_ASPECT_DEPTH_BIT,
                               VK_IMAGE_TILING_OPTIMAL);
         sf.DepthBufferSurfaceFormat =
            isl_format_get_depth_format(isl_format, false);
      }
   }
#endif

#if GFX_VER >= 8
   GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
#else
# undef raster
   GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
#endif
}

static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineMultisampleStateCreateInfo *info,
              uint32_t dynamic_states)
{
   /* Only look up the sample locations if the extension is active; otherwise
    * the default ones will be used, either at device initialization time or
    * through 3DSTATE_MULTISAMPLE on Gfx7/7.5 by passing NULL locations.
    */
   if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations) {
      /* If the sample locations are dynamic, 3DSTATE_MULTISAMPLE on Gfx7/7.5
       * will be emitted dynamically, so skip it here.  On Gfx8+
       * 3DSTATE_SAMPLE_PATTERN will be emitted dynamically, so skip it here.
       */
      if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)) {
#if GFX_VER >= 8
         genX(emit_sample_pattern)(&pipeline->base.batch,
                                   pipeline->dynamic_state.sample_locations.samples,
                                   pipeline->dynamic_state.sample_locations.locations);
#endif
      }

      genX(emit_multisample)(&pipeline->base.batch,
                             pipeline->dynamic_state.sample_locations.samples,
                             pipeline->dynamic_state.sample_locations.locations);
   } else {
      /* On Gfx8+ 3DSTATE_MULTISAMPLE does not hold anything we need to modify
       * for sample locations, so we don't have to emit it dynamically.
       */
#if GFX_VER >= 8
      genX(emit_multisample)(&pipeline->base.batch,
                             info ? info->rasterizationSamples : 1,
                             NULL);
#endif
   }

   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
    */
#if GFX_VER >= 8
   uint32_t sample_mask = 0xffff;
#else
   uint32_t sample_mask = 0xff;
#endif

   if (info && info->pSampleMask)
      sample_mask &= info->pSampleMask[0];

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = sample_mask;
   }

   pipeline->cps_state = ANV_STATE_NULL;
#if GFX_VER >= 11
   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) &&
       pipeline->base.device->vk.enabled_extensions.KHR_fragment_shading_rate) {
#if GFX_VER >= 12
      struct anv_device *device = pipeline->base.device;
      const uint32_t num_dwords =
         GENX(CPS_STATE_length) * 4 * pipeline->dynamic_state.viewport.count;
      pipeline->cps_state =
         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords, 32);
#endif

      genX(emit_shading_rate)(&pipeline->base.batch,
                              pipeline,
                              pipeline->cps_state,
                              &pipeline->dynamic_state);
   }
#endif
}

const uint32_t genX(vk_to_intel_logic_op)[] = {
   [VK_LOGIC_OP_COPY] = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR] = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND] = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE] = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED] = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP] = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR] = LOGICOP_XOR,
   [VK_LOGIC_OP_OR] = LOGICOP_OR,
   [VK_LOGIC_OP_NOR] = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT] = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT] = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE] = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED] = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND] = LOGICOP_NAND,
   [VK_LOGIC_OP_SET] = LOGICOP_SET,
};

static const uint32_t vk_to_intel_blend[] = {
   [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR] = BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA] = BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t vk_to_intel_blend_op[] = {
   [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
};

const uint32_t genX(vk_to_intel_compare_op)[] = {
   [VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS] = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL] = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER] = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_ALWAYS,
};

const uint32_t genX(vk_to_intel_stencil_op)[] = {
   [VK_STENCIL_OP_KEEP] = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO] = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE] = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT] = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP] = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP] = STENCILOP_DECR,
};

const uint32_t genX(vk_to_intel_primitive_type)[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

/* This function sanitizes the VkStencilOpState by looking at the compare ops
 * and trying to determine whether or not a given stencil op can ever actually
 * occur.  Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
 * This function returns true if, after sanitization, any of the stencil ops
 * are set to something other than VK_STENCIL_OP_KEEP.
 */
static bool
sanitize_stencil_face(VkStencilOpState *face,
                      VkCompareOp depthCompareOp)
{
   /* If compareOp is ALWAYS then the stencil test will never fail and failOp
    * will never happen.  Set failOp to KEEP in this case.
    */
   if (face->compareOp == VK_COMPARE_OP_ALWAYS)
      face->failOp = VK_STENCIL_OP_KEEP;

   /* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth
    * or stencil tests will fail and passOp will never happen.
    */
   if (face->compareOp == VK_COMPARE_OP_NEVER ||
       depthCompareOp == VK_COMPARE_OP_NEVER)
      face->passOp = VK_STENCIL_OP_KEEP;

   /* If compareOp is NEVER or depthCompareOp is ALWAYS then either the
    * stencil test will fail or the depth test will pass.  In either case,
    * depthFailOp will never happen.
    */
   if (face->compareOp == VK_COMPARE_OP_NEVER ||
       depthCompareOp == VK_COMPARE_OP_ALWAYS)
      face->depthFailOp = VK_STENCIL_OP_KEEP;

   return face->failOp != VK_STENCIL_OP_KEEP ||
          face->depthFailOp != VK_STENCIL_OP_KEEP ||
          face->passOp != VK_STENCIL_OP_KEEP;
}

/* Intel hardware is fairly sensitive to whether or not depth/stencil writes
 * are enabled.  In the presence of discards, it's fairly easy to get into the
 * non-promoted case which means a fairly big performance hit.  From the Iron
 * Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":
 *
 *    "Non-promoted depth (N) is active whenever the depth test can be done
 *    early but it cannot determine whether or not to write source depth to
 *    the depth buffer, therefore the depth write must be performed post pixel
 *    shader.  This includes cases where the pixel shader can kill pixels,
 *    including via sampler chroma key, as well as cases where the alpha test
 *    function is enabled, which kills pixels based on a programmable alpha
 *    test.  In this case, even if the depth test fails, the pixel cannot be
 *    killed if a stencil write is indicated.  Whether or not the stencil write
 *    happens depends on whether or not the pixel is killed later.  In these
 *    cases if stencil test fails and stencil writes are off, the pixels can
 *    also be killed early.  If stencil writes are enabled, the pixels must be
 *    treated as Computed depth (described above)."
 *
 * The same thing as mentioned in the stencil case can happen in the depth
 * case as well if it thinks it writes depth but, thanks to the depth test
 * being GL_EQUAL, the write doesn't actually matter.  A little extra work
 * up-front to try and disable depth and stencil writes can make a big
 * difference.
 *
 * Unfortunately, the way depth and stencil testing is specified, there are
 * many cases where, regardless of depth/stencil writes being enabled, nothing
 * actually gets written due to some other bit of state being set.  This
 * function attempts to "sanitize" the depth stencil state and disable writes
 * and sometimes even testing whenever possible.
 */
static void
sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,
                  bool *stencilWriteEnable,
                  VkImageAspectFlags ds_aspects)
{
   *stencilWriteEnable = state->stencilTestEnable;

   /* If the depth test is disabled, we won't be writing anything.  Make sure
    * we treat the test as always passing later on as well.
    *
    * Also, the Vulkan spec requires that if either depth or stencil is not
    * present, the pipeline is to act as if the test silently passes.  In that
    * case we won't write either.
    */
   if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      state->depthWriteEnable = false;
      state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
   }

   if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
      *stencilWriteEnable = false;
      state->front.compareOp = VK_COMPARE_OP_ALWAYS;
      state->back.compareOp = VK_COMPARE_OP_ALWAYS;
   }

   /* If the stencil test is enabled and always fails, then we will never get
    * to the depth test so we can just disable the depth test entirely.
    */
   if (state->stencilTestEnable &&
       state->front.compareOp == VK_COMPARE_OP_NEVER &&
       state->back.compareOp == VK_COMPARE_OP_NEVER) {
      state->depthTestEnable = false;
      state->depthWriteEnable = false;
   }

   /* If depthCompareOp is EQUAL then the value we would be writing to the
    * depth buffer is the same as the value that's already there so there's no
    * point in writing it.
    */
   if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)
      state->depthWriteEnable = false;
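   /* For example, a pipeline created with depthCompareOp = VK_COMPARE_OP_EQUAL
    * and depthWriteEnable = VK_TRUE ends up with depth writes disabled here,
    * since a write that passes an EQUAL test can never change the stored
    * value.
    */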

   /* If the stencil ops are such that we don't actually ever modify the
    * stencil buffer, we should disable writes.
    */
   if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&
       !sanitize_stencil_face(&state->back, state->depthCompareOp))
      *stencilWriteEnable = false;

   /* If the depth test always passes and we never write out depth, that's the
    * same as if the depth test is disabled entirely.
    */
   if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&
       !state->depthWriteEnable)
      state->depthTestEnable = false;

   /* If the stencil test always passes and we never write out stencil, that's
    * the same as if the stencil test is disabled entirely.
    */
   if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&
       state->back.compareOp == VK_COMPARE_OP_ALWAYS &&
       !*stencilWriteEnable)
      state->stencilTestEnable = false;
}

static void
emit_ds_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,
              const uint32_t dynamic_states,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass)
{
#if GFX_VER == 7
# define depth_stencil_dw pipeline->gfx7.depth_stencil_state
#elif GFX_VER == 8
# define depth_stencil_dw pipeline->gfx8.wm_depth_stencil
#else
# define depth_stencil_dw pipeline->gfx9.wm_depth_stencil
#endif

   if (pCreateInfo == NULL) {
      /* We're going to OR this together with the dynamic state.  We need
       * to make sure it's initialized to something useful.
       */
      pipeline->writes_stencil = false;
      pipeline->stencil_test_enable = false;
      pipeline->writes_depth = false;
      pipeline->depth_test_enable = false;
      pipeline->depth_bounds_test_enable = false;
      memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
      return;
   }

   VkImageAspectFlags ds_aspects = 0;
   if (subpass->depth_stencil_attachment) {
      VkFormat depth_stencil_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      ds_aspects = vk_format_aspects(depth_stencil_format);
   }

   VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
   sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
   pipeline->stencil_test_enable = info.stencilTestEnable;
   pipeline->writes_depth = info.depthWriteEnable;
   pipeline->depth_test_enable = info.depthTestEnable;
   pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;

   bool dynamic_stencil_op =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

#if GFX_VER <= 7
   struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
#else
   struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
#endif
      .DepthTestEnable =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?
            0 : info.depthTestEnable,

      .DepthBufferWriteEnable =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?
            0 : info.depthWriteEnable,

      .DepthTestFunction =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?
            0 : genX(vk_to_intel_compare_op)[info.depthCompareOp],

      .DoubleSidedStencilEnable = true,

      .StencilTestEnable =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?
            0 : info.stencilTestEnable,

      .StencilFailOp = genX(vk_to_intel_stencil_op)[info.front.failOp],
      .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.front.passOp],
      .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.front.depthFailOp],
      .StencilTestFunction = genX(vk_to_intel_compare_op)[info.front.compareOp],
      .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[info.back.failOp],
      .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.back.passOp],
      .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.back.depthFailOp],
      .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[info.back.compareOp],
   };

   if (dynamic_stencil_op) {
      depth_stencil.StencilFailOp = 0;
      depth_stencil.StencilPassDepthPassOp = 0;
      depth_stencil.StencilPassDepthFailOp = 0;
      depth_stencil.StencilTestFunction = 0;
      depth_stencil.BackfaceStencilFailOp = 0;
      depth_stencil.BackfaceStencilPassDepthPassOp = 0;
      depth_stencil.BackfaceStencilPassDepthFailOp = 0;
      depth_stencil.BackfaceStencilTestFunction = 0;
   }

#if GFX_VER <= 7
   GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
#else
   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
#endif
}

static bool
is_dual_src_blend_factor(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
}

static inline uint32_t *
write_disabled_blend(uint32_t *state)
{
   struct GENX(BLEND_STATE_ENTRY) entry = {
      .WriteDisableAlpha = true,
      .WriteDisableRed = true,
      .WriteDisableGreen = true,
      .WriteDisableBlue = true,
   };
   GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);
   return state + GENX(BLEND_STATE_ENTRY_length);
}

static void
emit_cb_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineColorBlendStateCreateInfo *info,
              const VkPipelineMultisampleStateCreateInfo *ms_info,
              uint32_t dynamic_states)
{
   struct anv_device *device = pipeline->base.device;
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   struct GENX(BLEND_STATE) blend_state = {
#if GFX_VER >= 8
      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
#endif
   };

   uint32_t surface_count = 0;
   struct anv_pipeline_bind_map *map;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
      surface_count = map->surface_count;
   }

   const uint32_t num_dwords = GENX(BLEND_STATE_length) +
      GENX(BLEND_STATE_ENTRY_length) * surface_count;
   uint32_t *blend_state_start, *state_pos;

   if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
                         ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
      const struct intel_device_info *devinfo = &pipeline->base.device->info;
      blend_state_start = devinfo->ver >= 8 ?
         pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;
      pipeline->blend_state = ANV_STATE_NULL;
   } else {
      pipeline->blend_state =
         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
      blend_state_start = pipeline->blend_state.map;
   }
   state_pos = blend_state_start;

   bool has_writeable_rt = false;
   state_pos += GENX(BLEND_STATE_length);
#if GFX_VER >= 8
   struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
#endif
   for (unsigned i = 0; i < surface_count; i++) {
      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];

      /* All color attachments are at the beginning of the binding table */
      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
         break;

      /* We can have at most 8 attachments */
      assert(i < MAX_RTS);

      if (info == NULL || binding->index >= info->attachmentCount) {
         state_pos = write_disabled_blend(state_pos);
         continue;
      }

      if ((pipeline->dynamic_state.color_writes & (1u << binding->index)) == 0) {
         state_pos = write_disabled_blend(state_pos);
         continue;
      }

      const VkPipelineColorBlendAttachmentState *a =
         &info->pAttachments[binding->index];

      struct GENX(BLEND_STATE_ENTRY) entry = {
#if GFX_VER < 8
         .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
         .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
#endif
         .LogicOpEnable = info->logicOpEnable,
         .LogicOpFunction = dynamic_states & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP ?
            0 : genX(vk_to_intel_logic_op)[info->logicOp],

         /* Vulkan specification 1.2.168, VkLogicOp:
          *
          *    "Logical operations are controlled by the logicOpEnable and
          *    logicOp members of VkPipelineColorBlendStateCreateInfo. If
          *    logicOpEnable is VK_TRUE, then a logical operation selected by
          *    logicOp is applied between each color attachment and the
          *    fragment’s corresponding output value, and blending of all
          *    attachments is treated as if it were disabled."
          *
          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
          * BLEND_STATE_ENTRY:
          *
          *    "Enabling LogicOp and Color Buffer Blending at the same time is
          *    UNDEFINED"
          */
         .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
         .ColorClampRange = COLORCLAMP_RTFORMAT,
         .PreBlendColorClampEnable = true,
         .PostBlendColorClampEnable = true,
         .SourceBlendFactor = vk_to_intel_blend[a->srcColorBlendFactor],
         .DestinationBlendFactor = vk_to_intel_blend[a->dstColorBlendFactor],
         .ColorBlendFunction = vk_to_intel_blend_op[a->colorBlendOp],
         .SourceAlphaBlendFactor = vk_to_intel_blend[a->srcAlphaBlendFactor],
         .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dstAlphaBlendFactor],
         .AlphaBlendFunction = vk_to_intel_blend_op[a->alphaBlendOp],
         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
      };

      if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
          a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
          a->colorBlendOp != a->alphaBlendOp) {
#if GFX_VER >= 8
         blend_state.IndependentAlphaBlendEnable = true;
#else
         entry.IndependentAlphaBlendEnable = true;
#endif
      }

      /* The Dual Source Blending documentation says:
       *
       *    "If SRC1 is included in a src/dst blend factor and
       *    a DualSource RT Write message is not used, results
       *    are UNDEFINED. (This reflects the same restriction in DX APIs,
       *    where undefined results are produced if “o1” is not written
       *    by a PS – there are no default values defined)."
       *
       * There is no way to gracefully fix this undefined situation
       * so we just disable the blending to prevent possible issues.
       */
      if (!wm_prog_data->dual_src_blend &&
          (is_dual_src_blend_factor(a->srcColorBlendFactor) ||
           is_dual_src_blend_factor(a->dstColorBlendFactor) ||
           is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||
           is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {
         vk_logw(VK_LOG_OBJS(&device->vk.base),
                 "Enabled dual-src blend factors without writing both targets "
                 "in the shader.  Disabling blending to avoid GPU hangs.");
         entry.ColorBufferBlendEnable = false;
      }

      if (a->colorWriteMask != 0)
         has_writeable_rt = true;

      /* Our hardware applies the blend factor prior to the blend function
       * regardless of what function is used.  Technically, this means the
       * hardware can do MORE than GL or Vulkan specify.  However, it also
       * means that, for MIN and MAX, we have to stomp the blend factor to
       * ONE to make it a no-op.
       */
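      /* (Vulkan defines MIN/MAX as min/max(Cs, Cd) with the blend factors
       * ignored, so forcing both factors to ONE makes the hardware compute
       * exactly that.)
       */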
      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
          a->colorBlendOp == VK_BLEND_OP_MAX) {
         entry.SourceBlendFactor = BLENDFACTOR_ONE;
         entry.DestinationBlendFactor = BLENDFACTOR_ONE;
      }
      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
          a->alphaBlendOp == VK_BLEND_OP_MAX) {
         entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
         entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
      }
      GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
      state_pos += GENX(BLEND_STATE_ENTRY_length);
#if GFX_VER >= 8
      if (i == 0)
         bs0 = entry;
#endif
   }

#if GFX_VER >= 8
   struct GENX(3DSTATE_PS_BLEND) blend = {
      GENX(3DSTATE_PS_BLEND_header),
   };
   blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;
   blend.HasWriteableRT = has_writeable_rt;
   blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;
   blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;
   blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;
   blend.SourceBlendFactor = bs0.SourceBlendFactor;
   blend.DestinationBlendFactor = bs0.DestinationBlendFactor;
   blend.AlphaTestEnable = false;
   blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable;

   if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
                         ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
      GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);
   } else {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)
         _blend = blend;
   }
#else
   (void)has_writeable_rt;
#endif

   GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);

   if (!(dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
                           ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP))) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
         bsp.BlendStatePointer = pipeline->blend_state.offset;
#if GFX_VER >= 8
         bsp.BlendStatePointerValid = true;
#endif
      }
   }
}

static void
emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
                  const VkPipelineInputAssemblyStateCreateInfo *ia_info,
                  const VkPipelineViewportStateCreateInfo *vp_info,
                  const VkPipelineRasterizationStateCreateInfo *rs_info,
                  const uint32_t dynamic_states)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   (void) wm_prog_data;

   struct GENX(3DSTATE_CLIP) clip = {
      GENX(3DSTATE_CLIP_header),
   };

   clip.ClipEnable = true;
   clip.StatisticsEnable = true;
   clip.EarlyCullEnable = true;
   clip.APIMode = APIMODE_D3D;
   clip.GuardbandClipTestEnable = true;

   /* Only enable the XY clip test when the final polygon rasterization
    * mode is VK_POLYGON_MODE_FILL.  We want to leave it disabled for
    * points and lines so we get "pop-free" clipping.
    */
   VkPolygonMode raster_mode =
      genX(raster_polygon_mode)(pipeline, ia_info->topology);
   clip.ViewportXYClipTestEnable =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ?
      0 : (raster_mode == VK_POLYGON_MODE_FILL);

#if GFX_VER >= 8
   clip.VertexSubPixelPrecisionSelect = _8Bit;
#endif
   clip.ClipMode = CLIPMODE_NORMAL;

   switch (vk_provoking_vertex_mode(rs_info)) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      clip.TriangleStripListProvokingVertexSelect = 0;
      clip.LineStripListProvokingVertexSelect = 0;
      clip.TriangleFanProvokingVertexSelect = 1;
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      clip.TriangleStripListProvokingVertexSelect = 2;
      clip.LineStripListProvokingVertexSelect = 1;
      clip.TriangleFanProvokingVertexSelect = 2;
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }

   clip.MinimumPointWidth = 0.125;
   clip.MaximumPointWidth = 255.875;

   if (anv_pipeline_is_primitive(pipeline)) {
      const struct brw_vue_prog_data *last =
         anv_pipeline_get_last_vue_prog_data(pipeline);

      /* From the Vulkan 1.0.45 spec:
       *
       *    "If the last active vertex processing stage shader entry point's
       *    interface does not include a variable decorated with
       *    ViewportIndex, then the first viewport is used."
       */
      if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
         clip.MaximumVPIndex = vp_info->viewportCount > 0 ?
            vp_info->viewportCount - 1 : 0;
      } else {
         clip.MaximumVPIndex = 0;
      }

      /* From the Vulkan 1.0.45 spec:
       *
       *    "If the last active vertex processing stage shader entry point's
       *    interface does not include a variable decorated with Layer, then
       *    the first layer is used."
       */
      clip.ForceZeroRTAIndexEnable =
         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);

#if GFX_VER == 7
      clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
      clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
#endif
   }

#if GFX_VER == 7
   clip.FrontWinding = genX(vk_to_intel_front_face)[rs_info->frontFace];
   clip.CullMode = genX(vk_to_intel_cullmode)[rs_info->cullMode];
   clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#else
   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
      (wm_prog_data->barycentric_interp_modes &
       BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
#endif

   GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);
}

static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
                       const VkPipelineRasterizationStateCreateInfo *rs_info,
                       const uint32_t dynamic_states)
{
   const struct brw_vue_prog_data *prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
   const struct brw_vue_map *vue_map = &prog_data->vue_map;

   nir_xfb_info *xfb_info;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
   else
      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;

#if GFX_VER == 7
# define streamout_state_dw pipeline->gfx7.streamout_state
#else
# define streamout_state_dw pipeline->gfx8.streamout_state
#endif

   struct GENX(3DSTATE_STREAMOUT) so = {
      GENX(3DSTATE_STREAMOUT_header),
      .RenderingDisable =
         (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) ?
         0 : rs_info->rasterizerDiscardEnable,
   };

   if (xfb_info) {
      so.SOFunctionEnable = true;
      so.SOStatisticsEnable = true;

      switch (vk_provoking_vertex_mode(rs_info)) {
      case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
         so.ReorderMode = LEADING;
         break;

      case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
         so.ReorderMode = TRAILING;
         break;

      default:
         unreachable("Invalid provoking vertex mode");
      }

      const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
         vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
      so.RenderStreamSelect = stream_info ?
1559 stream_info->rasterizationStream : 0;
1560
1561 #if GFX_VER >= 8
1562 so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
1563 so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
1564 so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
1565 so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
1566 #else
1567 pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;
1568 pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;
1569 pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;
1570 pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;
1571
1572 /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which
1573 * is a bit inconvenient because we don't know what buffers will
1574 * actually be enabled until draw time. We do our best here by
1575 * setting them based on buffers_written and we disable them
1576 * as-needed at draw time by setting EndAddress = BaseAddress.
1577 */
1578 so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);
1579 so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);
1580 so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);
1581 so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);
1582 #endif
1583
1584 int urb_entry_read_offset = 0;
1585 int urb_entry_read_length =
1586 (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
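/* vue_map.num_slots counts 128-bit VUE slots while the Stream*VertexReadLength
 * fields appear to be in 256-bit units (two slots each), hence the
 * round-up divide by two above.
 */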
1587
1588 /* We always read the whole vertex. This could be reduced at some
1589 * point by reading less and offsetting the register index in the
1590 * SO_DECLs.
1591 */
1592 so.Stream0VertexReadOffset = urb_entry_read_offset;
1593 so.Stream0VertexReadLength = urb_entry_read_length - 1;
1594 so.Stream1VertexReadOffset = urb_entry_read_offset;
1595 so.Stream1VertexReadLength = urb_entry_read_length - 1;
1596 so.Stream2VertexReadOffset = urb_entry_read_offset;
1597 so.Stream2VertexReadLength = urb_entry_read_length - 1;
1598 so.Stream3VertexReadOffset = urb_entry_read_offset;
1599 so.Stream3VertexReadLength = urb_entry_read_length - 1;
1600 }
1601
1602 if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
1603 GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);
1604 } else {
1605 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), _so)
1606 _so = so;
1607 }
1608
1609 if (xfb_info) {
1610 struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
1611 int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
1612 int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
1613
1614 memset(so_decl, 0, sizeof(so_decl));
1615
1616 for (unsigned i = 0; i < xfb_info->output_count; i++) {
1617 const nir_xfb_output_info *output = &xfb_info->outputs[i];
1618 unsigned buffer = output->buffer;
1619 unsigned stream = xfb_info->buffer_to_stream[buffer];
1620
1621 /* Our hardware is unusual in that it requires us to program SO_DECLs
1622 * for fake "hole" components, rather than simply taking the offset
1623 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
1624 * program as many size = 4 holes as we can, then a final hole to
1625 * accommodate the final 1, 2, or 3 remaining.
1626 */
1627 int hole_dwords = (output->offset - next_offset[buffer]) / 4;
1628 while (hole_dwords > 0) {
1629 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1630 .HoleFlag = 1,
1631 .OutputBufferSlot = buffer,
1632 .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
1633 };
1634 hole_dwords -= 4;
1635 }
1636
1637 int varying = output->location;
1638 uint8_t component_mask = output->component_mask;
1639 /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
1640 * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
1641 * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y
1642 * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
1643 * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w
1644 */
1645 if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
1646 varying = VARYING_SLOT_PSIZ;
1647 component_mask = 1 << 0; // SO_DECL_COMPMASK_X
1648 } else if (varying == VARYING_SLOT_LAYER) {
1649 varying = VARYING_SLOT_PSIZ;
1650 component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
1651 } else if (varying == VARYING_SLOT_VIEWPORT) {
1652 varying = VARYING_SLOT_PSIZ;
1653 component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
1654 } else if (varying == VARYING_SLOT_PSIZ) {
1655 component_mask = 1 << 3; // SO_DECL_COMPMASK_W
1656 }
1657
1658 next_offset[buffer] = output->offset +
1659 __builtin_popcount(component_mask) * 4;
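/* next_offset now points just past this output's components (4 bytes
 * each), so the next iteration's hole computation starts from here.
 */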
1660
1661 const int slot = vue_map->varying_to_slot[varying];
1662 if (slot < 0) {
1663 /* This can happen if the shader never writes to the varying.
1664 * Insert a hole instead of actual varying data.
1665 */
1666 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1667 .HoleFlag = true,
1668 .OutputBufferSlot = buffer,
1669 .ComponentMask = component_mask,
1670 };
1671 } else {
1672 so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1673 .OutputBufferSlot = buffer,
1674 .RegisterIndex = slot,
1675 .ComponentMask = component_mask,
1676 };
1677 }
1678 }
1679
1680 int max_decls = 0;
1681 for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
1682 max_decls = MAX2(max_decls, decls[s]);
1683
1684 uint8_t sbs[MAX_XFB_STREAMS] = { };
1685 for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
1686 if (xfb_info->buffers_written & (1 << b))
1687 sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
1688 }
1689
1690 uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,
1691 GENX(3DSTATE_SO_DECL_LIST),
1692 .StreamtoBufferSelects0 = sbs[0],
1693 .StreamtoBufferSelects1 = sbs[1],
1694 .StreamtoBufferSelects2 = sbs[2],
1695 .StreamtoBufferSelects3 = sbs[3],
1696 .NumEntries0 = decls[0],
1697 .NumEntries1 = decls[1],
1698 .NumEntries2 = decls[2],
1699 .NumEntries3 = decls[3]);
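/* 3DSTATE_SO_DECL_LIST has 3 DWords of header/selects before the entries,
 * and each SO_DECL_ENTRY packs one 16-bit SO_DECL per stream into 2
 * DWords, which is why the command above was sized at 3 + 2 * max_decls.
 */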
1700
1701 for (int i = 0; i < max_decls; i++) {
1702 GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
1703 &(struct GENX(SO_DECL_ENTRY)) {
1704 .Stream0Decl = so_decl[0][i],
1705 .Stream1Decl = so_decl[1][i],
1706 .Stream2Decl = so_decl[2][i],
1707 .Stream3Decl = so_decl[3][i],
1708 });
1709 }
1710 }
1711 }
1712
1713 static uint32_t
1714 get_sampler_count(const struct anv_shader_bin *bin)
1715 {
1716 uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);
1717
1718 /* We can potentially have way more than 32 samplers and that's ok.
1719 * However, the 3DSTATE_XS packets only have 3 bits to specify how
1720 * many to pre-fetch and all values above 4 are marked reserved.
1721 */
1722 return MIN2(count_by_4, 4);
1723 }
1724
1725 static UNUSED struct anv_address
1726 get_scratch_address(struct anv_pipeline *pipeline,
1727 gl_shader_stage stage,
1728 const struct anv_shader_bin *bin)
1729 {
1730 return (struct anv_address) {
1731 .bo = anv_scratch_pool_alloc(pipeline->device,
1732 &pipeline->device->scratch_pool,
1733 stage, bin->prog_data->total_scratch),
1734 .offset = 0,
1735 };
1736 }
1737
1738 static UNUSED uint32_t
1739 get_scratch_space(const struct anv_shader_bin *bin)
1740 {
1741 return ffs(bin->prog_data->total_scratch / 2048);
1742 }
1743
1744 static UNUSED uint32_t
1745 get_scratch_surf(struct anv_pipeline *pipeline,
1746 gl_shader_stage stage,
1747 const struct anv_shader_bin *bin)
1748 {
1749 if (bin->prog_data->total_scratch == 0)
1750 return 0;
1751
1752 struct anv_bo *bo =
1753 anv_scratch_pool_alloc(pipeline->device,
1754 &pipeline->device->scratch_pool,
1755 stage, bin->prog_data->total_scratch);
1756 anv_reloc_list_add_bo(pipeline->batch.relocs,
1757 pipeline->batch.alloc, bo);
1758 return anv_scratch_pool_get_surf(pipeline->device,
1759 &pipeline->device->scratch_pool,
1760 bin->prog_data->total_scratch) >> 4;
1761 }
1762
1763 static void
1764 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
1765 {
1766 const struct intel_device_info *devinfo = &pipeline->base.device->info;
1767 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1768 const struct anv_shader_bin *vs_bin =
1769 pipeline->shaders[MESA_SHADER_VERTEX];
1770
1771 assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
1772
1773 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
1774 vs.Enable = true;
1775 vs.StatisticsEnable = true;
1776 vs.KernelStartPointer = vs_bin->kernel.offset;
1777 #if GFX_VER >= 8
1778 vs.SIMD8DispatchEnable =
1779 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
1780 #endif
1781
1782 assert(!vs_prog_data->base.base.use_alt_mode);
1783 #if GFX_VER < 11
1784 vs.SingleVertexDispatch = false;
1785 #endif
1786 vs.VectorMaskEnable = false;
1787 /* Wa_1606682166:
1788 * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
1789 * Disable the Sampler state prefetch functionality in the SARB by
1790 * programming 0xB000[30] to '1'.
1791 */
1792 vs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
1793 vs.BindingTableEntryCount = vs_bin->bind_map.surface_count;
1794 vs.FloatingPointMode = IEEE754;
1795 vs.IllegalOpcodeExceptionEnable = false;
1796 vs.SoftwareExceptionEnable = false;
1797 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
1798
1799 if (GFX_VER == 9 && devinfo->gt == 4 &&
1800 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1801 /* On Sky Lake GT4, we have experienced some hangs related to the VS
1802 * cache and tessellation. It is unknown exactly what is happening
1803 * but the Haswell docs for the "VS Reference Count Full Force Miss
1804 * Enable" field of the "Thread Mode" register refer to a HSW bug in
1805 * which the VUE handle reference count would overflow resulting in
1806 * internal reference counting bugs. My (Jason's) best guess is that
1807 * this bug cropped back up on SKL GT4 when we suddenly had more
1808 * threads in play than any previous gfx9 hardware.
1809 *
1810 * What we do know for sure is that setting this bit when
1811 * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
1812 * City when playing with DXVK (https://bugs.freedesktop.org/107280).
1813 * Disabling the vertex cache with tessellation shaders should only
1814 * have a minor performance impact as the tessellation shaders are
1815 * likely generating and processing far more geometry than the vertex
1816 * stage.
1817 */
1818 vs.VertexCacheDisable = true;
1819 }
1820
1821 vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
1822 vs.VertexURBEntryReadOffset = 0;
1823 vs.DispatchGRFStartRegisterForURBData =
1824 vs_prog_data->base.base.dispatch_grf_start_reg;
1825
1826 #if GFX_VER >= 8
1827 vs.UserClipDistanceClipTestEnableBitmask =
1828 vs_prog_data->base.clip_distance_mask;
1829 vs.UserClipDistanceCullTestEnableBitmask =
1830 vs_prog_data->base.cull_distance_mask;
1831 #endif
1832
1833 #if GFX_VERx10 >= 125
1834 vs.ScratchSpaceBuffer =
1835 get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
1836 #else
1837 vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
1838 vs.ScratchSpaceBasePointer =
1839 get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
1840 #endif
1841 }
1842 }
1843
1844 static void
1845 emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
1846 const VkPipelineTessellationStateCreateInfo *tess_info)
1847 {
1848 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1849 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
1850 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
1851 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
1852 return;
1853 }
1854
1855 const struct intel_device_info *devinfo = &pipeline->base.device->info;
1856 const struct anv_shader_bin *tcs_bin =
1857 pipeline->shaders[MESA_SHADER_TESS_CTRL];
1858 const struct anv_shader_bin *tes_bin =
1859 pipeline->shaders[MESA_SHADER_TESS_EVAL];
1860
1861 const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1862 const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1863
1864 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
1865 hs.Enable = true;
1866 hs.StatisticsEnable = true;
1867 hs.KernelStartPointer = tcs_bin->kernel.offset;
1868 /* Wa_1606682166 */
1869 hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
1870 hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
1871
1872 #if GFX_VER >= 12
1873 /* Wa_1604578095:
1874 *
* A hang occurs when the maximum number of threads is less than twice
* the instance count. The maximum number of threads must be more than
* twice the instance count.
1878 */
1879 assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
1880 #endif
1881
1882 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1883 hs.IncludeVertexHandles = true;
1884 hs.InstanceCount = tcs_prog_data->instances - 1;
1885
1886 hs.VertexURBEntryReadLength = 0;
1887 hs.VertexURBEntryReadOffset = 0;
1888 hs.DispatchGRFStartRegisterForURBData =
1889 tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
1890 #if GFX_VER >= 12
1891 hs.DispatchGRFStartRegisterForURBData5 =
1892 tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
1893 #endif
1894
1895 #if GFX_VERx10 >= 125
1896 hs.ScratchSpaceBuffer =
1897 get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
1898 #else
1899 hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1900 hs.ScratchSpaceBasePointer =
1901 get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
1902 #endif
1903
1904 #if GFX_VER == 12
1905 /* Patch Count threshold specifies the maximum number of patches that
1906 * will be accumulated before a thread dispatch is forced.
1907 */
1908 hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
1909 #endif
1910
1911 #if GFX_VER >= 9
1912 hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
1913 hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
1914 #endif
1915 }
1916
1917 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
1918 tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;
1919
1920 VkTessellationDomainOrigin uv_origin =
1921 domain_origin_state ? domain_origin_state->domainOrigin :
1922 VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
1923
1924 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
1925 te.Partitioning = tes_prog_data->partitioning;
1926
1927 if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
1928 te.OutputTopology = tes_prog_data->output_topology;
1929 } else {
1930 /* When the origin is upper-left, we have to flip the winding order */
1931 if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
1932 te.OutputTopology = OUTPUT_TRI_CW;
1933 } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
1934 te.OutputTopology = OUTPUT_TRI_CCW;
1935 } else {
1936 te.OutputTopology = tes_prog_data->output_topology;
1937 }
1938 }
1939
1940 te.TEDomain = tes_prog_data->domain;
1941 te.TEEnable = true;
1942 te.MaximumTessellationFactorOdd = 63.0;
1943 te.MaximumTessellationFactorNotOdd = 64.0;
1944 }
1945
1946 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
1947 ds.Enable = true;
1948 ds.StatisticsEnable = true;
1949 ds.KernelStartPointer = tes_bin->kernel.offset;
1950 /* Wa_1606682166 */
1951 ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
1952 ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
1953 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1954
1955 ds.ComputeWCoordinateEnable =
1956 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
1957
1958 ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1959 ds.PatchURBEntryReadOffset = 0;
1960 ds.DispatchGRFStartRegisterForURBData =
1961 tes_prog_data->base.base.dispatch_grf_start_reg;
1962
1963 #if GFX_VER >= 8
1964 #if GFX_VER < 11
1965 ds.DispatchMode =
1966 tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
1967 DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1968 DISPATCH_MODE_SIMD4X2;
1969 #else
1970 assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
1971 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
1972 #endif
1973
1974 ds.UserClipDistanceClipTestEnableBitmask =
1975 tes_prog_data->base.clip_distance_mask;
1976 ds.UserClipDistanceCullTestEnableBitmask =
1977 tes_prog_data->base.cull_distance_mask;
1978 #endif
1979
1980 #if GFX_VERx10 >= 125
1981 ds.ScratchSpaceBuffer =
1982 get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
1983 #else
1984 ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1985 ds.ScratchSpaceBasePointer =
1986 get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
1987 #endif
1988 }
1989 }
1990
1991 static void
1992 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
1993 {
1994 const struct intel_device_info *devinfo = &pipeline->base.device->info;
1995 const struct anv_shader_bin *gs_bin =
1996 pipeline->shaders[MESA_SHADER_GEOMETRY];
1997
1998 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1999 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);
2000 return;
2001 }
2002
2003 const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
2004
2005 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {
2006 gs.Enable = true;
2007 gs.StatisticsEnable = true;
2008 gs.KernelStartPointer = gs_bin->kernel.offset;
2009 gs.DispatchMode = gs_prog_data->base.dispatch_mode;
2010
2011 gs.SingleProgramFlow = false;
2012 gs.VectorMaskEnable = false;
2013 /* Wa_1606682166 */
2014 gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
2015 gs.BindingTableEntryCount = gs_bin->bind_map.surface_count;
2016 gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
2017 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2018
2019 if (GFX_VER == 8) {
2020 /* Broadwell is weird. It needs us to divide by 2. */
2021 gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
2022 } else {
2023 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2024 }
2025
2026 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2027 gs.OutputTopology = gs_prog_data->output_topology;
2028 gs.ControlDataFormat = gs_prog_data->control_data_format;
2029 gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
2030 gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;
2031 gs.ReorderMode = TRAILING;
2032
2033 #if GFX_VER >= 8
2034 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2035 gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;
2036 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
2037 gs_prog_data->static_vertex_count : 0;
2038 #endif
2039
2040 gs.VertexURBEntryReadOffset = 0;
2041 gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
2042 gs.DispatchGRFStartRegisterForURBData =
2043 gs_prog_data->base.base.dispatch_grf_start_reg;
2044
2045 #if GFX_VER >= 8
2046 gs.UserClipDistanceClipTestEnableBitmask =
2047 gs_prog_data->base.clip_distance_mask;
2048 gs.UserClipDistanceCullTestEnableBitmask =
2049 gs_prog_data->base.cull_distance_mask;
2050 #endif
2051
2052 #if GFX_VERx10 >= 125
2053 gs.ScratchSpaceBuffer =
2054 get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
2055 #else
2056 gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
2057 gs.ScratchSpaceBasePointer =
2058 get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
2059 #endif
2060 }
2061 }
2062
2063 static bool
2064 has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,
2065 const VkPipelineColorBlendStateCreateInfo *blend)
2066 {
2067 const struct anv_shader_bin *shader_bin =
2068 pipeline->shaders[MESA_SHADER_FRAGMENT];
2069 if (!shader_bin)
2070 return false;
2071
2072 if (!pipeline->dynamic_state.color_writes)
2073 return false;
2074
2075 const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
2076 for (int i = 0; i < bind_map->surface_count; i++) {
2077 struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];
2078
2079 if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
2080 continue;
2081
2082 if (binding->index == UINT32_MAX)
2083 continue;
2084
2085 if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
2086 return true;
2087 }
2088
2089 return false;
2090 }
2091
2092 static void
2093 emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
2094 const VkPipelineInputAssemblyStateCreateInfo *ia,
2095 const VkPipelineRasterizationStateCreateInfo *raster,
2096 const VkPipelineColorBlendStateCreateInfo *blend,
2097 const VkPipelineMultisampleStateCreateInfo *multisample,
2098 const VkPipelineRasterizationLineStateCreateInfoEXT *line,
2099 const uint32_t dynamic_states)
2100 {
2101 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2102
2103 struct GENX(3DSTATE_WM) wm = {
2104 GENX(3DSTATE_WM_header),
2105 };
2106 wm.StatisticsEnable = true;
2107 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2108 wm.LineAntialiasingRegionWidth = _10pixels;
2109 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2110
2111 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2112 if (wm_prog_data->early_fragment_tests) {
2113 wm.EarlyDepthStencilControl = EDSC_PREPS;
2114 } else if (wm_prog_data->has_side_effects) {
2115 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
2116 } else {
2117 wm.EarlyDepthStencilControl = EDSC_NORMAL;
2118 }
2119
2120 #if GFX_VER >= 8
2121 /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
2122 * doesn't take into account KillPixels when no depth or stencil
2123 * writes are enabled. In order for occlusion queries to work
2124 * correctly with no attachments, we need to force-enable PS thread
2125 * dispatch.
2126 *
* The BDW docs are pretty clear that this bit isn't validated
2128 * and probably shouldn't be used in production:
2129 *
2130 * "This must always be set to Normal. This field should not be
2131 * tested for functional validation."
2132 *
2133 * Unfortunately, however, the other mechanism we have for doing this
2134 * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
2135 * Given two bad options, we choose the one which works.
2136 */
2137 pipeline->force_fragment_thread_dispatch =
2138 wm_prog_data->has_side_effects ||
2139 wm_prog_data->uses_kill;
2140
2141 if (pipeline->force_fragment_thread_dispatch ||
2142 !has_color_buffer_write_enabled(pipeline, blend)) {
/* Only set this value when the blend state is not dynamic. */
2144 wm.ForceThreadDispatchEnable =
2145 !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
2146 }
2147 #endif
2148
2149 wm.BarycentricInterpolationMode =
2150 wm_prog_data->barycentric_interp_modes;
2151
2152 #if GFX_VER < 8
2153 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
2154 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
2155 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
2156 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2157
2158 /* If the subpass has a depth or stencil self-dependency, then we
2159 * need to force the hardware to do the depth/stencil write *after*
2160 * fragment shader execution. Otherwise, the writes may hit memory
2161 * before we get around to fetching from the input attachment and we
2162 * may get the depth or stencil value from the current draw rather
2163 * than the previous one.
2164 */
2165 wm.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
2166 wm_prog_data->uses_kill;
2167
2168 pipeline->force_fragment_thread_dispatch =
2169 wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
2170 wm_prog_data->has_side_effects ||
2171 wm.PixelShaderKillsPixel;
2172
2173 if (pipeline->force_fragment_thread_dispatch ||
2174 has_color_buffer_write_enabled(pipeline, blend)) {
/* Only set this value when the blend state is not dynamic. */
2176 wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
2177 }
2178
2179 if (multisample && multisample->rasterizationSamples > 1) {
2180 if (wm_prog_data->persample_dispatch) {
2181 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2182 } else {
2183 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
2184 }
2185 } else {
2186 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2187 }
2188
2189 VkPolygonMode raster_mode =
2190 genX(raster_polygon_mode)(pipeline, ia->topology);
2191
2192 wm.MultisampleRasterizationMode =
2193 dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ? 0 :
2194 genX(ms_rasterization_mode)(pipeline, raster_mode);
2195 #endif
2196
2197 wm.LineStippleEnable = line && line->stippledLineEnable;
2198 }
2199
2200 uint32_t dynamic_wm_states = ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
2201
2202 #if GFX_VER < 8
2203 dynamic_wm_states |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
2204 #endif
2205
2206 if (dynamic_states & dynamic_wm_states) {
2207 const struct intel_device_info *devinfo = &pipeline->base.device->info;
2208 uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
2209 GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
2210 } else {
2211 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
2212 _wm = wm;
2213 }
2214 }
2215
2216 static void
2217 emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
2218 const VkPipelineColorBlendStateCreateInfo *blend,
2219 const VkPipelineMultisampleStateCreateInfo *multisample)
2220 {
2221 UNUSED const struct intel_device_info *devinfo =
2222 &pipeline->base.device->info;
2223 const struct anv_shader_bin *fs_bin =
2224 pipeline->shaders[MESA_SHADER_FRAGMENT];
2225
2226 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2227 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
2228 #if GFX_VER == 7
2229 /* Even if no fragments are ever dispatched, gfx7 hardware hangs if
2230 * we don't at least set the maximum number of threads.
2231 */
2232 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
2233 #endif
2234 }
2235 return;
2236 }
2237
2238 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2239
2240 #if GFX_VER < 8
2241 /* The hardware wedges if you have this bit set but don't turn on any dual
2242 * source blend factors.
2243 */
2244 bool dual_src_blend = false;
2245 if (wm_prog_data->dual_src_blend && blend) {
2246 for (uint32_t i = 0; i < blend->attachmentCount; i++) {
2247 const VkPipelineColorBlendAttachmentState *bstate =
2248 &blend->pAttachments[i];
2249
2250 if (bstate->blendEnable &&
2251 (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
2252 is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
2253 is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
2254 is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
2255 dual_src_blend = true;
2256 break;
2257 }
2258 }
2259 }
2260 #endif
2261
2262 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
2263 ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
2264 ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
2265 ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
2266
2267 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
2268 *
2269 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
2270 * Dispatch must not be enabled for PER_PIXEL dispatch mode."
2271 *
* Since 16x MSAA was first introduced on SKL, we don't need to apply
2273 * the workaround on any older hardware.
2274 */
2275 if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
2276 multisample && multisample->rasterizationSamples == 16) {
2277 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
2278 ps._32PixelDispatchEnable = false;
2279 }
2280
2281 ps.KernelStartPointer0 = fs_bin->kernel.offset +
2282 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
2283 ps.KernelStartPointer1 = fs_bin->kernel.offset +
2284 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
2285 ps.KernelStartPointer2 = fs_bin->kernel.offset +
2286 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
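/* The three kernel start pointers above correspond to the SIMD8/16/32
 * dispatch variants enabled earlier; brw_wm_prog_data_prog_offset()
 * resolves each KSP slot to the matching variant's offset within the
 * binary, following the hardware's slot-assignment rules.
 */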
2287
2288 ps.SingleProgramFlow = false;
2289 ps.VectorMaskEnable = GFX_VER >= 8;
2290 /* Wa_1606682166 */
2291 ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
2292 ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
2293 ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
2294 wm_prog_data->base.ubo_ranges[0].length;
2295 ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ?
2296 POSOFFSET_SAMPLE: POSOFFSET_NONE;
2297 #if GFX_VER < 8
2298 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
2299 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
2300 ps.DualSourceBlendEnable = dual_src_blend;
2301 #endif
2302
2303 #if GFX_VERx10 == 75
2304 /* Haswell requires the sample mask to be set in this packet as well
2305 * as in 3DSTATE_SAMPLE_MASK; the values should match.
2306 */
2307 ps.SampleMask = 0xff;
2308 #endif
2309
2310 #if GFX_VER >= 9
2311 ps.MaximumNumberofThreadsPerPSD = 64 - 1;
2312 #elif GFX_VER >= 8
2313 ps.MaximumNumberofThreadsPerPSD = 64 - 2;
2314 #else
2315 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
2316 #endif
2317
2318 ps.DispatchGRFStartRegisterForConstantSetupData0 =
2319 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
2320 ps.DispatchGRFStartRegisterForConstantSetupData1 =
2321 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
2322 ps.DispatchGRFStartRegisterForConstantSetupData2 =
2323 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
2324
2325 #if GFX_VERx10 >= 125
2326 ps.ScratchSpaceBuffer =
2327 get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
2328 #else
2329 ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
2330 ps.ScratchSpaceBasePointer =
2331 get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
2332 #endif
2333 }
2334 }
2335
2336 #if GFX_VER >= 8
2337 static void
2338 emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
2339 struct anv_subpass *subpass,
2340 const VkPipelineRasterizationStateCreateInfo *rs_info)
2341 {
2342 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2343
2344 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2345 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);
2346 return;
2347 }
2348
2349 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {
2350 ps.PixelShaderValid = true;
2351 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
2352 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
2353 ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
2354 ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
2355 ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
2356 ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
2357
2358 /* If the subpass has a depth or stencil self-dependency, then we need
2359 * to force the hardware to do the depth/stencil write *after* fragment
2360 * shader execution. Otherwise, the writes may hit memory before we get
2361 * around to fetching from the input attachment and we may get the depth
2362 * or stencil value from the current draw rather than the previous one.
2363 */
2364 ps.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
2365 wm_prog_data->uses_kill;
2366
2367 #if GFX_VER >= 9
2368 ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
2369 ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;
2370
2371 ps.InputCoverageMaskState = ICMS_NONE;
2372 assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
2373 if (!wm_prog_data->uses_sample_mask)
2374 ps.InputCoverageMaskState = ICMS_NONE;
2375 else if (wm_prog_data->per_coarse_pixel_dispatch)
2376 ps.InputCoverageMaskState = ICMS_NORMAL;
2377 else if (wm_prog_data->post_depth_coverage)
2378 ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
2379 else
2380 ps.InputCoverageMaskState = ICMS_NORMAL;
2381 #else
2382 ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2383 #endif
2384
2385 #if GFX_VER >= 11
2386 ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
2387 wm_prog_data->uses_depth_w_coefficients;
2388 ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;
2389 #endif
2390 }
2391 }
2392
2393 static void
2394 emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)
2395 {
2396 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
2397 vft.PrimitiveTopologyType = pipeline->topology;
2398 }
2399 }
2400 #endif
2401
2402 static void
2403 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
2404 {
2405 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
2406 vfs.StatisticsEnable = true;
2407 }
2408 }
2409
2410 static void
2411 compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
2412 const VkPipelineMultisampleStateCreateInfo *ms_info,
2413 const struct anv_subpass *subpass)
2414 {
2415 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2416 pipeline->kill_pixel = false;
2417 return;
2418 }
2419
2420 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2421
2422 /* This computes the KillPixel portion of the computation for whether or
2423 * not we want to enable the PMA fix on gfx8 or gfx9. It's given by this
2424 * chunk of the giant formula:
2425 *
2426 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2427 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2428 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2429 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
2430 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2431 *
2432 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
2433 * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
2434 * of an alpha test.
2435 */
2436 pipeline->kill_pixel =
2437 subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
2438 wm_prog_data->uses_omask ||
2439 (ms_info && ms_info->alphaToCoverageEnable);
2440 }
2441
2442 #if GFX_VER == 12
2443 static void
2444 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
2445 {
2446 if (!pipeline->use_primitive_replication) {
2447 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
2448 return;
2449 }
2450
2451 uint32_t view_mask = pipeline->subpass->view_mask;
2452 int view_count = util_bitcount(view_mask);
2453 assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
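/* Illustrative example: view_mask 0b1010 gives view_count == 2, so
 * ReplicaMask becomes 0b11, ReplicationCount 1, and RTAIOffset[] = {1, 3}.
 */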
2454
2455 anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
2456 pr.ReplicaMask = (1 << view_count) - 1;
2457 pr.ReplicationCount = view_count - 1;
2458
2459 int i = 0;
2460 u_foreach_bit(view_index, view_mask) {
2461 pr.RTAIOffset[i] = view_index;
2462 i++;
2463 }
2464 }
2465 }
2466 #endif
2467
2468 static VkResult
2469 genX(graphics_pipeline_create)(
2470 VkDevice _device,
2471 struct anv_pipeline_cache * cache,
2472 const VkGraphicsPipelineCreateInfo* pCreateInfo,
2473 const VkAllocationCallbacks* pAllocator,
2474 VkPipeline* pPipeline)
2475 {
2476 ANV_FROM_HANDLE(anv_device, device, _device);
2477 ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
2478 struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
2479 struct anv_graphics_pipeline *pipeline;
2480 VkResult result;
2481
2482 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
2483
2484 /* Use the default pipeline cache if none is specified */
2485 if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2486 cache = &device->default_pipeline_cache;
2487
2488 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
2489 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2490 if (pipeline == NULL)
2491 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2492
2493 result = anv_graphics_pipeline_init(pipeline, device, cache,
2494 pCreateInfo, pAllocator);
2495 if (result != VK_SUCCESS) {
2496 vk_free2(&device->vk.alloc, pAllocator, pipeline);
2497 if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
2498 *pPipeline = VK_NULL_HANDLE;
2499 return result;
2500 }
2501
2502 /* Information on which states are considered dynamic. */
2503 const VkPipelineDynamicStateCreateInfo *dyn_info =
2504 pCreateInfo->pDynamicState;
2505 uint32_t dynamic_states = 0;
2506 if (dyn_info) {
2507 for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
2508 dynamic_states |=
2509 anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
2510 }
2511
2512
2513 /* If rasterization is not enabled, various CreateInfo structs must be
2514 * ignored.
2515 */
2516 const bool raster_enabled =
2517 !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
2518 (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
2519
2520 const VkPipelineViewportStateCreateInfo *vp_info =
2521 raster_enabled ? pCreateInfo->pViewportState : NULL;
2522
2523 const VkPipelineMultisampleStateCreateInfo *ms_info =
2524 raster_enabled ? pCreateInfo->pMultisampleState : NULL;
2525
2526 const VkPipelineDepthStencilStateCreateInfo *ds_info =
2527 raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
2528
2529 const VkPipelineColorBlendStateCreateInfo *cb_info =
2530 raster_enabled ? pCreateInfo->pColorBlendState : NULL;
2531
2532 const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
2533 vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
2534 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
2535
2536 enum intel_urb_deref_block_size urb_deref_block_size;
2537 emit_urb_setup(pipeline, &urb_deref_block_size);
2538
2539 assert(pCreateInfo->pRasterizationState);
2540 emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
2541 pCreateInfo->pRasterizationState,
2542 ms_info, line_info, dynamic_states, pass, subpass,
2543 urb_deref_block_size);
2544 emit_ms_state(pipeline, ms_info, dynamic_states);
2545 emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
2546 emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
2547 compute_kill_pixel(pipeline, ms_info, subpass);
2548
2549 emit_3dstate_clip(pipeline,
2550 pCreateInfo->pInputAssemblyState,
2551 vp_info,
2552 pCreateInfo->pRasterizationState,
2553 dynamic_states);
2554
2555 #if GFX_VER == 12
2556 emit_3dstate_primitive_replication(pipeline);
2557 #endif
2558
2559 #if 0
2560 /* From gfx7_vs_state.c */
2561
2562 /**
2563 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2564 * Geometry > Geometry Shader > State:
2565 *
2566 * "Note: Because of corruption in IVB:GT2, software needs to flush the
2567 * whole fixed function pipeline when the GS enable changes value in
2568 * the 3DSTATE_GS."
2569 *
2570 * The hardware architects have clarified that in this context "flush the
2571 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2572 * Stall" bit set.
2573 */
2574 if (!device->info.is_haswell && !device->info.is_baytrail)
2575 gfx7_emit_vs_workaround_flush(brw);
2576 #endif
2577
2578 if (anv_pipeline_is_primitive(pipeline)) {
2579 assert(pCreateInfo->pVertexInputState);
2580 emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
2581
2582 emit_3dstate_vs(pipeline);
2583 emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
2584 emit_3dstate_gs(pipeline);
2585
2586 #if GFX_VER >= 8
2587 if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
2588 emit_3dstate_vf_topology(pipeline);
2589 #endif
2590
2591 emit_3dstate_vf_statistics(pipeline);
2592
2593 emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState,
2594 dynamic_states);
2595 }
2596
2597 emit_3dstate_sbe(pipeline);
2598 emit_3dstate_wm(pipeline, subpass,
2599 pCreateInfo->pInputAssemblyState,
2600 pCreateInfo->pRasterizationState,
2601 cb_info, ms_info, line_info, dynamic_states);
2602 emit_3dstate_ps(pipeline, cb_info, ms_info);
2603 #if GFX_VER >= 8
2604 emit_3dstate_ps_extra(pipeline, subpass,
2605 pCreateInfo->pRasterizationState);
2606 #endif
2607
2608 *pPipeline = anv_pipeline_to_handle(&pipeline->base);
2609
2610 return pipeline->base.batch.status;
2611 }
2612
2613 #if GFX_VERx10 >= 125
2614
2615 static void
2616 emit_compute_state(struct anv_compute_pipeline *pipeline,
2617 const struct anv_device *device)
2618 {
2619 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2620 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2621
2622 const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
2623 const struct intel_device_info *devinfo = &device->info;
2624
2625 anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
2626 cfe.MaximumNumberofThreads =
2627 devinfo->max_cs_threads * devinfo->subslice_total - 1;
2628 cfe.ScratchSpaceBuffer =
2629 get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2630 }
2631 }
2632
2633 #else /* #if GFX_VERx10 >= 125 */
2634
2635 static void
2636 emit_compute_state(struct anv_compute_pipeline *pipeline,
2637 const struct anv_device *device)
2638 {
2639 const struct intel_device_info *devinfo = &device->info;
2640 const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2641
2642 anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2643
2644 const struct brw_cs_dispatch_info dispatch =
2645 brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
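/* CURBE space covers the per-thread push constant registers for every HW
 * thread in the dispatch plus the shared cross-thread registers, rounded
 * up to an even GRF count.
 */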
2646 const uint32_t vfe_curbe_allocation =
2647 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
2648 cs_prog_data->push.cross_thread.regs, 2);
2649
2650 const struct anv_shader_bin *cs_bin = pipeline->cs;
2651
2652 anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
2653 #if GFX_VER > 7
2654 vfe.StackSize = 0;
2655 #else
2656 vfe.GPGPUMode = true;
2657 #endif
2658 vfe.MaximumNumberofThreads =
2659 devinfo->max_cs_threads * devinfo->subslice_total - 1;
2660 vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2;
2661 #if GFX_VER < 11
2662 vfe.ResetGatewayTimer = true;
2663 #endif
2664 #if GFX_VER <= 8
2665 vfe.BypassGatewayControl = true;
2666 #endif
2667 vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
2668 vfe.CURBEAllocationSize = vfe_curbe_allocation;
2669
2670 if (cs_bin->prog_data->total_scratch) {
2671 if (GFX_VER >= 8) {
2672 /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
2673 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
2674 */
2675 vfe.PerThreadScratchSpace =
2676 ffs(cs_bin->prog_data->total_scratch) - 11;
2677 } else if (GFX_VERx10 == 75) {
2678 /* Haswell's Per Thread Scratch Space is in the range [0, 10]
2679 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
2680 */
2681 vfe.PerThreadScratchSpace =
2682 ffs(cs_bin->prog_data->total_scratch) - 12;
2683 } else {
2684 /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
2685 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
2686 */
2687 vfe.PerThreadScratchSpace =
2688 cs_bin->prog_data->total_scratch / 1024 - 1;
2689 }
2690 vfe.ScratchSpaceBasePointer =
2691 get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2692 }
2693 }
2694
2695 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
2696 .KernelStartPointer =
2697 cs_bin->kernel.offset +
2698 brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
2699
2700 /* Wa_1606682166 */
2701 .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
2702 /* We add 1 because the CS indirect parameters buffer isn't accounted
2703 * for in bind_map.surface_count.
2704 */
2705 .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
2706 .BarrierEnable = cs_prog_data->uses_barrier,
2707 .SharedLocalMemorySize =
2708 encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
2709
2710 #if GFX_VERx10 != 75
2711 .ConstantURBEntryReadOffset = 0,
2712 #endif
2713 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
2714 #if GFX_VERx10 >= 75
2715 .CrossThreadConstantDataReadLength =
2716 cs_prog_data->push.cross_thread.regs,
2717 #endif
2718 #if GFX_VER >= 12
2719 /* TODO: Check if we are missing workarounds and enable mid-thread
2720 * preemption.
2721 *
2722 * We still have issues with mid-thread preemption (it was already
2723 * disabled by the kernel on gfx11, due to missing workarounds). It's
2724 * possible that we are just missing some workarounds, and could enable
* it later, but for now let's disable it to fix a GPU hang in compute in
* Car Chase (and possibly more).
2727 */
2728 .ThreadPreemptionDisable = true,
2729 #endif
2730
2731 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
2732 };
2733 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
2734 pipeline->interface_descriptor_data,
2735 &desc);
2736 }
2737
2738 #endif /* #if GFX_VERx10 >= 125 */
2739
2740 static VkResult
2741 compute_pipeline_create(
2742 VkDevice _device,
2743 struct anv_pipeline_cache * cache,
2744 const VkComputePipelineCreateInfo* pCreateInfo,
2745 const VkAllocationCallbacks* pAllocator,
2746 VkPipeline* pPipeline)
2747 {
2748 ANV_FROM_HANDLE(anv_device, device, _device);
2749 struct anv_compute_pipeline *pipeline;
2750 VkResult result;
2751
2752 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
2753
2754 /* Use the default pipeline cache if none is specified */
2755 if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2756 cache = &device->default_pipeline_cache;
2757
2758 pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
2759 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2760 if (pipeline == NULL)
2761 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2762
2763 result = anv_pipeline_init(&pipeline->base, device,
2764 ANV_PIPELINE_COMPUTE, pCreateInfo->flags,
2765 pAllocator);
2766 if (result != VK_SUCCESS) {
2767 vk_free2(&device->vk.alloc, pAllocator, pipeline);
2768 return result;
2769 }
2770
2771 anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
2772 pipeline->batch_data, sizeof(pipeline->batch_data));
2773
2774 assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
2775 VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module);
2776 result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
2777 pCreateInfo->stage.pName,
2778 pCreateInfo->stage.pSpecializationInfo);
2779 if (result != VK_SUCCESS) {
2780 anv_pipeline_finish(&pipeline->base, device, pAllocator);
2781 vk_free2(&device->vk.alloc, pAllocator, pipeline);
2782 if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
2783 *pPipeline = VK_NULL_HANDLE;
2784 return result;
2785 }
2786
2787 emit_compute_state(pipeline, device);
2788
2789 *pPipeline = anv_pipeline_to_handle(&pipeline->base);
2790
2791 return pipeline->base.batch.status;
2792 }
2793
2794 VkResult genX(CreateGraphicsPipelines)(
2795 VkDevice _device,
2796 VkPipelineCache pipelineCache,
2797 uint32_t count,
2798 const VkGraphicsPipelineCreateInfo* pCreateInfos,
2799 const VkAllocationCallbacks* pAllocator,
2800 VkPipeline* pPipelines)
2801 {
2802 ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
2803
2804 VkResult result = VK_SUCCESS;
2805
2806 unsigned i;
2807 for (i = 0; i < count; i++) {
2808 VkResult res = genX(graphics_pipeline_create)(_device,
2809 pipeline_cache,
2810 &pCreateInfos[i],
2811 pAllocator, &pPipelines[i]);
2812
2813 if (res == VK_SUCCESS)
2814 continue;
2815
/* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EXT as it
* is not obvious which error should be reported upon two different
* failures.
*/
2819 result = res;
2820 if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
2821 break;
2822
2823 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
2824 break;
2825 }
2826
2827 for (; i < count; i++)
2828 pPipelines[i] = VK_NULL_HANDLE;
2829
2830 return result;
2831 }
2832
2833 VkResult genX(CreateComputePipelines)(
2834 VkDevice _device,
2835 VkPipelineCache pipelineCache,
2836 uint32_t count,
2837 const VkComputePipelineCreateInfo* pCreateInfos,
2838 const VkAllocationCallbacks* pAllocator,
2839 VkPipeline* pPipelines)
2840 {
2841 ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
2842
2843 VkResult result = VK_SUCCESS;
2844
2845 unsigned i;
2846 for (i = 0; i < count; i++) {
2847 VkResult res = compute_pipeline_create(_device, pipeline_cache,
2848 &pCreateInfos[i],
2849 pAllocator, &pPipelines[i]);
2850
2851 if (res == VK_SUCCESS)
2852 continue;
2853
/* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EXT as it
* is not obvious which error should be reported upon two different
* failures.
*/
2857 result = res;
2858 if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
2859 break;
2860
2861 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
2862 break;
2863 }
2864
2865 for (; i < count; i++)
2866 pPipelines[i] = VK_NULL_HANDLE;
2867
2868 return result;
2869 }
2870
2871 #if GFX_VERx10 >= 125
2872
2873 static void
2874 assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
2875 uint32_t stage_idx,
2876 VkShaderStageFlags valid_stages)
2877 {
2878 if (stage_idx == VK_SHADER_UNUSED_KHR)
2879 return;
2880
2881 assert(stage_idx <= pCreateInfo->stageCount);
2882 assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);
2883 assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);
2884 }
2885
2886 static VkResult
2887 ray_tracing_pipeline_create(
2888 VkDevice _device,
2889 struct anv_pipeline_cache * cache,
2890 const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
2891 const VkAllocationCallbacks* pAllocator,
2892 VkPipeline* pPipeline)
2893 {
2894 ANV_FROM_HANDLE(anv_device, device, _device);
2895 VkResult result;
2896
2897 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);
2898
2899 /* Use the default pipeline cache if none is specified */
2900 if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2901 cache = &device->default_pipeline_cache;
2902
2903 VK_MULTIALLOC(ma);
2904 VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);
2905 VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount);
2906 if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
2907 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2908 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2909
2910 result = anv_pipeline_init(&pipeline->base, device,
2911 ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags,
2912 pAllocator);
2913 if (result != VK_SUCCESS) {
2914 vk_free2(&device->vk.alloc, pAllocator, pipeline);
2915 return result;
2916 }
2917
2918 pipeline->group_count = pCreateInfo->groupCount;
2919 pipeline->groups = groups;
2920
2921 ASSERTED const VkShaderStageFlags ray_tracing_stages =
2922 VK_SHADER_STAGE_RAYGEN_BIT_KHR |
2923 VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
2924 VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
2925 VK_SHADER_STAGE_MISS_BIT_KHR |
2926 VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
2927 VK_SHADER_STAGE_CALLABLE_BIT_KHR;
2928
2929 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)
2930 assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);
2931
2932 for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {
2933 const VkRayTracingShaderGroupCreateInfoKHR *ginfo =
2934 &pCreateInfo->pGroups[i];
2935 assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,
2936 VK_SHADER_STAGE_RAYGEN_BIT_KHR |
2937 VK_SHADER_STAGE_MISS_BIT_KHR |
2938 VK_SHADER_STAGE_CALLABLE_BIT_KHR);
2939 assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,
2940 VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);
2941 assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,
2942 VK_SHADER_STAGE_ANY_HIT_BIT_KHR);
2943 assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader,
2944 VK_SHADER_STAGE_INTERSECTION_BIT_KHR);
2945 switch (ginfo->type) {
2946 case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
2947 assert(ginfo->generalShader < pCreateInfo->stageCount);
2948 assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);
2949 assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);
2950 assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
2951 break;
2952
2953 case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
2954 assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
2955 assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
2956 break;
2957
2958 case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
2959 assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
2960 break;
2961
2962 default:
2963 unreachable("Invalid ray-tracing shader group type");
2964 }
2965 }
2966
2967 result = anv_ray_tracing_pipeline_init(pipeline, device, cache,
2968 pCreateInfo, pAllocator);
2969 if (result != VK_SUCCESS) {
2970 anv_pipeline_finish(&pipeline->base, device, pAllocator);
2971 vk_free2(&device->vk.alloc, pAllocator, pipeline);
2972 return result;
2973 }
2974
2975 for (uint32_t i = 0; i < pipeline->group_count; i++) {
2976 struct anv_rt_shader_group *group = &pipeline->groups[i];
2977
2978 switch (group->type) {
2979 case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
2980 struct GFX_RT_GENERAL_SBT_HANDLE sh = {};
2981 sh.General = anv_shader_bin_get_bsr(group->general, 32);
2982 GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
2983 break;
2984 }
2985
2986 case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
2987 struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {};
2988 if (group->closest_hit)
2989 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2990 if (group->any_hit)
2991 sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
2992 GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh);
2993 break;
2994 }
2995
2996 case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
2997 struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {};
2998 if (group->closest_hit)
2999 sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
3000 sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
3001 GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
3002 break;
3003 }
3004
3005 default:
3006 unreachable("Invalid shader group type");
3007 }
3008 }
3009
3010 *pPipeline = anv_pipeline_to_handle(&pipeline->base);
3011
3012 return pipeline->base.batch.status;
3013 }
3014
3015 VkResult
3016 genX(CreateRayTracingPipelinesKHR)(
3017 VkDevice _device,
3018 VkDeferredOperationKHR deferredOperation,
3019 VkPipelineCache pipelineCache,
3020 uint32_t createInfoCount,
3021 const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
3022 const VkAllocationCallbacks* pAllocator,
3023 VkPipeline* pPipelines)
3024 {
3025 ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
3026
3027 VkResult result = VK_SUCCESS;
3028
3029 unsigned i;
3030 for (i = 0; i < createInfoCount; i++) {
3031 VkResult res = ray_tracing_pipeline_create(_device, pipeline_cache,
3032 &pCreateInfos[i],
3033 pAllocator, &pPipelines[i]);
3034
3035 if (res == VK_SUCCESS)
3036 continue;
3037
/* Bail out on the first error as it is not obvious which error should be
* reported upon two different failures. */
3040 result = res;
3041 if (result != VK_PIPELINE_COMPILE_REQUIRED_EXT)
3042 break;
3043
3044 if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
3045 break;
3046 }
3047
3048 for (; i < createInfoCount; i++)
3049 pPipelines[i] = VK_NULL_HANDLE;
3050
3051 return result;
3052 }
3053 #endif /* GFX_VERx10 >= 125 */
3054