/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "common/gen_l3_config.h"
#include "common/gen_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format_info.h"

static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
{
   uint8_t bits;
   switch (comp) {
   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
   default: unreachable("Invalid component");
   }

   /*
    * Take into account hardware restrictions when dealing with 64-bit floats.
    *
    * From the Broadwell spec, command reference structures, page 586:
    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
    *   64-bit components are stored in the URB without any conversion. In
    *   this case, vertex elements must be written as 128 or 256 bits, with
    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
    *   256-bit vertex element."
    */
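   /* A sketch of what the rules below produce for the passthru formats
    * (derived from the spec text above):
    *
    *    R64_PASSTHRU:    comp 0 = STORE_SRC, comp 1 = STORE_0 (pad to
    *                     128 bits), comps 2-3 = NOSTORE
    *    R64G64_PASSTHRU: comps 0-1 = STORE_SRC, comps 2-3 = NOSTORE
    */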
   if (bits) {
      return VFCOMP_STORE_SRC;
   } else if (comp >= 2 &&
              !isl_format_layouts[format].channels.b.bits &&
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* When emitting 64-bit attributes, we need to write either 128 or 256
       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
       * VFCOMP_STORE_0 to pad the written chunk */
      return VFCOMP_NOSTORE;
   } else if (comp < 3 ||
              isl_format_layouts[format].channels.r.type == ISL_RAW) {
      /* Note we need to pad with value 0, not 1, due to hardware restrictions
       * (see comment above) */
      return VFCOMP_STORE_0;
   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
            isl_format_layouts[format].channels.r.type == ISL_SINT) {
      assert(comp == 3);
      return VFCOMP_STORE_1_INT;
   } else {
      assert(comp == 3);
      return VFCOMP_STORE_1_FP;
   }
}

static void
emit_vertex_input(struct anv_graphics_pipeline *pipeline,
                  const VkPipelineVertexInputStateCreateInfo *info)
{
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   /* Pull inputs_read out of the VS prog data */
   const uint64_t inputs_read = vs_prog_data->inputs_read;
   const uint64_t double_inputs_read =
      vs_prog_data->double_inputs_read & inputs_read;
   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
   const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
                                vs_prog_data->uses_instanceid ||
                                vs_prog_data->uses_firstvertex ||
                                vs_prog_data->uses_baseinstance;

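   /* Each dvec3/dvec4 attribute occupies two of the bits in elements but
    * packs into a single 256-bit vertex element, so half of the
    * elements_double popcount is folded back out of the element count.
    * Illustrative example: elements = 0b1111 with elements_double = 0b0011
    * (one dvec3/dvec4 in the low two slots) gives elem_count = 4 - 2/2 = 3.
    */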
   uint32_t elem_count = __builtin_popcount(elements) -
      __builtin_popcount(elements_double) / 2;

   const uint32_t total_elems =
      MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);

   uint32_t *p;

   const uint32_t num_dwords = 1 + total_elems * 2;
   p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
                       GENX(3DSTATE_VERTEX_ELEMENTS));
   if (!p)
      return;

   for (uint32_t i = 0; i < total_elems; i++) {
      /* The SKL docs for VERTEX_ELEMENT_STATE say:
       *
       *    "All elements must be valid from Element[0] to the last valid
       *    element. (I.e. if Element[2] is valid then Element[1] and
       *    Element[0] must also be valid)."
       *
       * The SKL docs for 3D_Vertex_Component_Control say:
       *
       *    "Don't store this component. (Not valid for Component 0, but can
       *    be used for Component 1-3)."
       *
       * So we can't just leave a vertex element blank and hope for the best.
       * We have to tell the VF hardware to put something in it; so we just
       * store a bunch of zeros.
       *
       * TODO: Compact vertex elements so we never end up with holes.
       */
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
   }

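   /* Attributes that are actually used now overwrite their slot's
    * zero-filled template from the loop above; untouched slots keep the
    * valid STORE_0 filler.
    */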
   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *desc =
         &info->pVertexAttributeDescriptions[i];
      enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
                                                  desc->format,
                                                  VK_IMAGE_ASPECT_COLOR_BIT,
                                                  VK_IMAGE_TILING_LINEAR);

      assert(desc->binding < MAX_VBS);

      if ((elements & (1 << desc->location)) == 0)
         continue; /* Binding unused */

      uint32_t slot =
         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
         DIV_ROUND_UP(__builtin_popcount(elements_double &
                                         ((1 << desc->location) - 1)), 2);

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = desc->binding,
         .Valid = true,
         .SourceElementFormat = format,
         .EdgeFlagEnable = false,
         .SourceElementOffset = desc->offset,
         .Component0Control = vertex_element_comp_control(format, 0),
         .Component1Control = vertex_element_comp_control(format, 1),
         .Component2Control = vertex_element_comp_control(format, 2),
         .Component3Control = vertex_element_comp_control(format, 3),
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);

#if GEN_GEN >= 8
      /* On Broadwell and later, we have a separate VF_INSTANCING packet
       * that controls instancing.  On Haswell and prior, that's part of
       * VERTEX_BUFFER_STATE which we emit later.
       */
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
         vfi.VertexElementIndex = slot;
         vfi.InstanceDataStepRate =
            pipeline->vb[desc->binding].instance_divisor;
      }
#endif
   }

   const uint32_t id_slot = elem_count;
   if (needs_svgs_elem) {
      /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
       *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
       *    Control field is set to something other than VFCOMP_STORE_SRC,
       *    no higher-numbered Component Control fields may be set to
       *    VFCOMP_STORE_SRC"
       *
       * This means that if we have BaseInstance, we need BaseVertex as
       * well.  Just do all or nothing.
       */
      uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
                            vs_prog_data->uses_baseinstance) ?
                           VFCOMP_STORE_SRC : VFCOMP_STORE_0;

      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_SVGS_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
         .Component0Control = base_ctrl,
         .Component1Control = base_ctrl,
#if GEN_GEN >= 8
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#else
         .Component2Control = VFCOMP_STORE_VID,
         .Component3Control = VFCOMP_STORE_IID,
#endif
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);

#if GEN_GEN >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = id_slot;
      }
#endif
   }

#if GEN_GEN >= 8
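   /* On gen8+ the VF injects VertexID and InstanceID as "system generated
    * values" into components 2 and 3 of the element at id_slot, which is
    * why those components were left as STORE_0 above.
    */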
   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
      sgvs.VertexIDComponentNumber     = 2;
      sgvs.VertexIDElementOffset       = id_slot;
      sgvs.InstanceIDEnable            = vs_prog_data->uses_instanceid;
      sgvs.InstanceIDComponentNumber   = 3;
      sgvs.InstanceIDElementOffset     = id_slot;
   }
#endif

   const uint32_t drawid_slot = elem_count + needs_svgs_elem;
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) element = {
         .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
                                      &p[1 + drawid_slot * 2],
                                      &element);

#if GEN_GEN >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = drawid_slot;
      }
#endif
   }
}

void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                     const struct gen_l3_config *l3_config,
                     VkShaderStageFlags active_stages,
                     const unsigned entry_size[4],
                     enum gen_urb_deref_block_size *deref_block_size)
{
   const struct gen_device_info *devinfo = &device->info;

   unsigned entries[4];
   unsigned start[4];
   gen_get_urb_config(devinfo, l3_config,
                      active_stages &
                         VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
                      active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
                      entry_size, entries, start, deref_block_size);

#if GEN_GEN == 7 && !GEN_IS_HASWELL
   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
    *
    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
    *    needs to be sent before any combination of VS associated 3DSTATE."
    */
   anv_batch_emit(batch, GEN7_PIPE_CONTROL, pc) {
      pc.DepthStallEnable  = true;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address           = device->workaround_address;
   }
#endif

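   /* 3DSTATE_URB_VS, _HS, _DS, and _GS use consecutive 3D command
    * sub-opcodes, so all four packets can be emitted from the VS template
    * by bumping the sub-opcode as i walks the stages in MESA_SHADER_*
    * order.
    */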
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = start[i];
         urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
         urb.VSNumberofURBEntries      = entries[i];
      }
   }
}

static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
               enum gen_urb_deref_block_size *deref_block_size)
{
   unsigned entry_size[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      const struct brw_vue_prog_data *prog_data =
         !anv_pipeline_has_stage(pipeline, i) ? NULL :
         (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;

      entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
   }

   genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
                        pipeline->base.l3_config,
                        pipeline->active_stages, entry_size,
                        deref_block_size);
}

static void
emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
#if GEN_GEN >= 8
      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
#endif
      return;
   }

   const struct brw_vue_map *fs_input_map =
      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;

   struct GENX(3DSTATE_SBE) sbe = {
      GENX(3DSTATE_SBE_header),
      .AttributeSwizzleEnable = true,
      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
   };

#if GEN_GEN >= 9
   for (unsigned i = 0; i < 32; i++)
      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif

#if GEN_GEN >= 8
   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
      GENX(3DSTATE_SBE_SWIZ_header),
   };
#else
#  define swiz sbe
#endif

   int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
                                                        fs_input_map);
   assert(first_slot % 2 == 0);
   unsigned urb_entry_read_offset = first_slot / 2;
   int max_source_attr = 0;
   for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
      uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
      int input_index = wm_prog_data->urb_setup[attr];

      assert(0 <= input_index);

      /* gl_Viewport and gl_Layer are stored in the VUE header */
      if (attr == VARYING_SLOT_VIEWPORT || attr == VARYING_SLOT_LAYER) {
         continue;
      }

      if (attr == VARYING_SLOT_PNTC) {
         sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
         continue;
      }

      const int slot = fs_input_map->varying_to_slot[attr];

      if (slot == -1) {
         /* This attribute does not exist in the VUE--that means that the
          * vertex shader did not write to it.  It could be that it's a
          * regular varying read by the fragment shader but not written by
          * the vertex shader or it's gl_PrimitiveID. In the first case the
          * value is undefined, in the second it needs to be
          * gl_PrimitiveID.
          */
         swiz.Attribute[input_index].ConstantSource = PRIM_ID;
         swiz.Attribute[input_index].ComponentOverrideX = true;
         swiz.Attribute[input_index].ComponentOverrideY = true;
         swiz.Attribute[input_index].ComponentOverrideZ = true;
         swiz.Attribute[input_index].ComponentOverrideW = true;
         continue;
      }

      /* We have to subtract two slots to account for the URB entry output
       * read offset in the VS and GS stages.
       */
      const int source_attr = slot - 2 * urb_entry_read_offset;
      assert(source_attr >= 0 && source_attr < 32);
      max_source_attr = MAX2(max_source_attr, source_attr);
      /* The hardware can only do overrides on 16 overrides at a time, and the
       * other up to 16 have to be lined up so that the input index = the
       * output index. We'll need to do some tweaking to make sure that's the
       * case.
       */
      if (input_index < 16)
         swiz.Attribute[input_index].SourceAttribute = source_attr;
      else
         assert(source_attr == input_index);
   }

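   /* VertexURBEntryReadLength is counted in 256-bit (two-slot) units, hence
    * the round-up by two below.  Worked example: with first_slot == 4,
    * urb_entry_read_offset == 2 and a varying in VUE slot 6 becomes source
    * attribute 6 - 2 * 2 = 2.
    */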
   sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
   sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
#if GEN_GEN >= 8
   sbe.ForceVertexURBEntryReadOffset = true;
   sbe.ForceVertexURBEntryReadLength = true;
#endif

   uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
                                        GENX(3DSTATE_SBE_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);

#if GEN_GEN >= 8
   dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
   if (!dw)
      return;
   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
#endif
}

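/* Resolve VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT the way the Vulkan spec
 * describes: rectangular lines when rasterizing with more than one sample,
 * Bresenham lines otherwise.
 */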
static VkLineRasterizationModeEXT
vk_line_rasterization_mode(const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
                           const VkPipelineMultisampleStateCreateInfo *ms_info)
{
   VkLineRasterizationModeEXT line_mode =
      line_info ? line_info->lineRasterizationMode :
                  VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;

   if (line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT) {
      if (ms_info && ms_info->rasterizationSamples > 1) {
         return VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT;
      } else {
         return VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT;
      }
   }

   return line_mode;
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
static VkPolygonMode
anv_raster_polygon_mode(struct anv_graphics_pipeline *pipeline,
                        const VkPipelineInputAssemblyStateCreateInfo *ia_info,
                        const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return rs_info->polygonMode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return rs_info->polygonMode;
      }
      unreachable("Unsupported TES output topology");
   } else {
      switch (ia_info->topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return rs_info->polygonMode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}

#if GEN_GEN <= 7
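/* Map the line/polygon rasterization mode onto gen7's MSRASTMODE:
 * rectangular lines and ordinary multisampled fills rasterize on the MSAA
 * sample pattern, while Bresenham and smooth lines rasterize per-pixel.
 */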
static uint32_t
gen7_ms_rast_mode(struct anv_graphics_pipeline *pipeline,
                  const VkPipelineInputAssemblyStateCreateInfo *ia_info,
                  const VkPipelineRasterizationStateCreateInfo *rs_info,
                  const VkPipelineMultisampleStateCreateInfo *ms_info)
{
   const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
      vk_find_struct_const(rs_info->pNext,
                           PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);

   VkPolygonMode raster_mode =
      anv_raster_polygon_mode(pipeline, ia_info, rs_info);
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      switch (vk_line_rasterization_mode(line_info, ms_info)) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         return MSRASTMODE_ON_PATTERN;

      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
         return MSRASTMODE_OFF_PIXEL;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      return (ms_info && ms_info->rasterizationSamples > 1) ?
             MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
   }
}
#endif

const uint32_t genX(vk_to_gen_cullmode)[] = {
   [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
};

const uint32_t genX(vk_to_gen_fillmode)[] = {
   [VK_POLYGON_MODE_FILL]                    = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE]                    = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT]                   = FILL_MODE_POINT,
};

const uint32_t genX(vk_to_gen_front_face)[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
   [VK_FRONT_FACE_CLOCKWISE]                 = 0
};

static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineInputAssemblyStateCreateInfo *ia_info,
              const VkPipelineRasterizationStateCreateInfo *rs_info,
              const VkPipelineMultisampleStateCreateInfo *ms_info,
              const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
              const uint32_t dynamic_states,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass,
              enum gen_urb_deref_block_size urb_deref_block_size)
{
   struct GENX(3DSTATE_SF) sf = {
      GENX(3DSTATE_SF_header),
   };

   sf.ViewportTransformEnable = true;
   sf.StatisticsEnable = true;
   sf.TriangleStripListProvokingVertexSelect = 0;
   sf.LineStripListProvokingVertexSelect = 0;
   sf.TriangleFanProvokingVertexSelect = 1;
   sf.VertexSubPixelPrecisionSelect = _8Bit;
   sf.AALineDistanceMode = true;

#if GEN_IS_HASWELL
   sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
#endif

#if GEN_GEN >= 12
   sf.DerefBlockSize = urb_deref_block_size;
#endif

   const struct brw_vue_prog_data *last_vue_prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);

   if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
      sf.PointWidthSource = Vertex;
   } else {
      sf.PointWidthSource = State;
      sf.PointWidth = 1.0;
   }

#if GEN_GEN >= 8
   struct GENX(3DSTATE_RASTER) raster = {
      GENX(3DSTATE_RASTER_header),
   };
#else
#  define raster sf
#endif

   VkPolygonMode raster_mode =
      anv_raster_polygon_mode(pipeline, ia_info, rs_info);
   VkLineRasterizationModeEXT line_mode =
      vk_line_rasterization_mode(line_info, ms_info);

   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
    * "Multisample Modes State".
    */
#if GEN_GEN >= 8
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gen8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gen7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         raster.APIMode = DX100;
         raster.DXMultisampleRasterizationEnable = true;
         break;

      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
         raster.APIMode = DX9OGL;
         raster.DXMultisampleRasterizationEnable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      raster.APIMode = DX100;
      raster.DXMultisampleRasterizationEnable = true;
   }

   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
    * computations.  If we ever set this bit to a different value, they will
    * need to be updated accordingly.
    */
   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
   raster.ForceMultisampling = false;
#else
   raster.MultisampleRasterizationMode =
      gen7_ms_rast_mode(pipeline, ia_info, rs_info, ms_info);
#endif

   if (raster_mode == VK_POLYGON_MODE_LINE &&
       line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT)
      raster.AntialiasingEnable = true;

   raster.FrontWinding =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
         0 : genX(vk_to_gen_front_face)[rs_info->frontFace];
   raster.CullMode =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
         0 : genX(vk_to_gen_cullmode)[rs_info->cullMode];

   raster.FrontFaceFillMode = genX(vk_to_gen_fillmode)[rs_info->polygonMode];
   raster.BackFaceFillMode = genX(vk_to_gen_fillmode)[rs_info->polygonMode];
   raster.ScissorRectangleEnable = true;

#if GEN_GEN >= 9
   /* GEN9+ splits ViewportZClipTestEnable into near and far enable bits */
   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
#elif GEN_GEN >= 8
   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
#endif

   raster.GlobalDepthOffsetEnableSolid = rs_info->depthBiasEnable;
   raster.GlobalDepthOffsetEnableWireframe = rs_info->depthBiasEnable;
   raster.GlobalDepthOffsetEnablePoint = rs_info->depthBiasEnable;

#if GEN_GEN == 7
   /* Gen7 requires that we provide the depth format in 3DSTATE_SF so that it
    * can get the depth offsets correct.
    */
   if (subpass->depth_stencil_attachment) {
      VkFormat vk_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      assert(vk_format_is_depth_or_stencil(vk_format));
      if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
         enum isl_format isl_format =
            anv_get_isl_format(&pipeline->base.device->info, vk_format,
                               VK_IMAGE_ASPECT_DEPTH_BIT,
                               VK_IMAGE_TILING_OPTIMAL);
         sf.DepthBufferSurfaceFormat =
            isl_format_get_depth_format(isl_format, false);
      }
   }
#endif

#if GEN_GEN >= 8
   GENX(3DSTATE_SF_pack)(NULL, pipeline->gen8.sf, &sf);
   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gen8.raster, &raster);
#else
#  undef raster
   GENX(3DSTATE_SF_pack)(NULL, &pipeline->gen7.sf, &sf);
#endif
}

static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineMultisampleStateCreateInfo *info)
{
   uint32_t samples = 1;
   uint32_t log2_samples = 0;

   /* From the Vulkan 1.0 spec:
    *    If pSampleMask is NULL, it is treated as if the mask has all bits
    *    enabled, i.e. no coverage is removed from fragments.
    *
    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits on gen8+ and only 8 bits on
    * earlier gens, hence the two defaults below.
    */
#if GEN_GEN >= 8
   uint32_t sample_mask = 0xffff;
#else
   uint32_t sample_mask = 0xff;
#endif

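   /* rasterizationSamples is always a power of two, so
    * __builtin_ffs(samples) - 1 is log2(samples); e.g. samples == 4 yields
    * log2_samples == 2, the encoding 3DSTATE_MULTISAMPLE expects.
    */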
   if (info) {
      samples = info->rasterizationSamples;
      log2_samples = __builtin_ffs(samples) - 1;
   }

   if (info && info->pSampleMask)
      sample_mask &= info->pSampleMask[0];

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MULTISAMPLE), ms) {
      ms.NumberofMultisamples       = log2_samples;

      ms.PixelLocation              = CENTER;
#if GEN_GEN >= 8
      /* The PRM says that this bit is valid only for DX9:
       *
       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
       *    should not have any effect by setting or not setting this bit.
       */
      ms.PixelPositionOffsetEnable  = false;
#else

      switch (samples) {
      case 1:
         GEN_SAMPLE_POS_1X(ms.Sample);
         break;
      case 2:
         GEN_SAMPLE_POS_2X(ms.Sample);
         break;
      case 4:
         GEN_SAMPLE_POS_4X(ms.Sample);
         break;
      case 8:
         GEN_SAMPLE_POS_8X(ms.Sample);
         break;
      default:
         break;
      }
#endif
   }

   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = sample_mask;
   }
}

static const uint32_t vk_to_gen_logic_op[] = {
   [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
   [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
   [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
   [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
};

static const uint32_t vk_to_gen_blend[] = {
   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t vk_to_gen_blend_op[] = {
   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
};

const uint32_t genX(vk_to_gen_compare_op)[] = {
   [VK_COMPARE_OP_NEVER]                        = PREFILTEROPNEVER,
   [VK_COMPARE_OP_LESS]                         = PREFILTEROPLESS,
   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROPEQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROPLEQUAL,
   [VK_COMPARE_OP_GREATER]                      = PREFILTEROPGREATER,
   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROPNOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROPGEQUAL,
   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROPALWAYS,
};

const uint32_t genX(vk_to_gen_stencil_op)[] = {
   [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
};

const uint32_t genX(vk_to_gen_primitive_type)[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

/* This function sanitizes the VkStencilOpState by looking at the compare ops
 * and trying to determine whether or not a given stencil op can ever actually
 * occur.  Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
 * This function returns true if, after sanitization, any of the stencil ops
 * are set to something other than VK_STENCIL_OP_KEEP.
 */
static bool
sanitize_stencil_face(VkStencilOpState *face,
                      VkCompareOp depthCompareOp)
{
   /* If compareOp is ALWAYS then the stencil test will never fail and failOp
    * will never happen.  Set failOp to KEEP in this case.
    */
   if (face->compareOp == VK_COMPARE_OP_ALWAYS)
      face->failOp = VK_STENCIL_OP_KEEP;

   /* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth
    * or stencil tests will fail and passOp will never happen.
    */
   if (face->compareOp == VK_COMPARE_OP_NEVER ||
       depthCompareOp == VK_COMPARE_OP_NEVER)
      face->passOp = VK_STENCIL_OP_KEEP;

   /* If compareOp is NEVER or depthCompareOp is ALWAYS then either the
    * stencil test will fail or the depth test will pass.  In either case,
    * depthFailOp will never happen.
    */
   if (face->compareOp == VK_COMPARE_OP_NEVER ||
       depthCompareOp == VK_COMPARE_OP_ALWAYS)
      face->depthFailOp = VK_STENCIL_OP_KEEP;

   return face->failOp != VK_STENCIL_OP_KEEP ||
          face->depthFailOp != VK_STENCIL_OP_KEEP ||
          face->passOp != VK_STENCIL_OP_KEEP;
}

/* Intel hardware is fairly sensitive to whether or not depth/stencil writes
 * are enabled.  In the presence of discards, it's fairly easy to get into the
 * non-promoted case which means a fairly big performance hit.  From the Iron
 * Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":
 *
 *    "Non-promoted depth (N) is active whenever the depth test can be done
 *    early but it cannot determine whether or not to write source depth to
 *    the depth buffer, therefore the depth write must be performed post pixel
 *    shader. This includes cases where the pixel shader can kill pixels,
 *    including via sampler chroma key, as well as cases where the alpha test
 *    function is enabled, which kills pixels based on a programmable alpha
 *    test. In this case, even if the depth test fails, the pixel cannot be
 *    killed if a stencil write is indicated. Whether or not the stencil write
 *    happens depends on whether or not the pixel is killed later. In these
 *    cases if stencil test fails and stencil writes are off, the pixels can
 *    also be killed early. If stencil writes are enabled, the pixels must be
 *    treated as Computed depth (described above)."
 *
 * The same thing as mentioned in the stencil case can happen in the depth
 * case as well if it thinks it writes depth but, thanks to the depth test
 * being GL_EQUAL, the write doesn't actually matter.  A little extra work
 * up-front to try and disable depth and stencil writes can make a big
 * difference.
 *
 * Unfortunately, the way depth and stencil testing is specified, there are
 * many cases where, regardless of depth/stencil writes being enabled, nothing
 * actually gets written due to some other bit of state being set.  This
 * function attempts to "sanitize" the depth stencil state and disable writes
 * and sometimes even testing whenever possible.
 */
static void
sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,
                  bool *stencilWriteEnable,
                  VkImageAspectFlags ds_aspects)
{
   *stencilWriteEnable = state->stencilTestEnable;

   /* If the depth test is disabled, we won't be writing anything. Make sure we
    * treat the test as always passing later on as well.
    *
    * Also, the Vulkan spec requires that if either depth or stencil is not
    * present, the pipeline is to act as if the test silently passes. In that
    * case we won't write either.
    */
   if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      state->depthWriteEnable = false;
      state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
   }

   if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
      *stencilWriteEnable = false;
      state->front.compareOp = VK_COMPARE_OP_ALWAYS;
      state->back.compareOp = VK_COMPARE_OP_ALWAYS;
   }

   /* If the stencil test is enabled and always fails, then we will never get
    * to the depth test so we can just disable the depth test entirely.
    */
   if (state->stencilTestEnable &&
       state->front.compareOp == VK_COMPARE_OP_NEVER &&
       state->back.compareOp == VK_COMPARE_OP_NEVER) {
      state->depthTestEnable = false;
      state->depthWriteEnable = false;
   }

   /* If depthCompareOp is EQUAL then the value we would be writing to the
    * depth buffer is the same as the value that's already there so there's no
    * point in writing it.
    */
   if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)
      state->depthWriteEnable = false;

   /* If the stencil ops are such that we don't actually ever modify the
    * stencil buffer, we should disable writes.
    */
   if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&
       !sanitize_stencil_face(&state->back, state->depthCompareOp))
      *stencilWriteEnable = false;

   /* If the depth test always passes and we never write out depth, that's the
    * same as if the depth test is disabled entirely.
    */
   if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&
       !state->depthWriteEnable)
      state->depthTestEnable = false;

   /* If the stencil test always passes and we never write out stencil, that's
    * the same as if the stencil test is disabled entirely.
    */
   if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&
       state->back.compareOp == VK_COMPARE_OP_ALWAYS &&
       !*stencilWriteEnable)
      state->stencilTestEnable = false;
}

static void
emit_ds_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,
              const uint32_t dynamic_states,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass)
{
#if GEN_GEN == 7
#  define depth_stencil_dw pipeline->gen7.depth_stencil_state
#elif GEN_GEN == 8
#  define depth_stencil_dw pipeline->gen8.wm_depth_stencil
#else
#  define depth_stencil_dw pipeline->gen9.wm_depth_stencil
#endif

   if (pCreateInfo == NULL) {
      /* We're going to OR this together with the dynamic state.  We need
       * to make sure it's initialized to something useful.
       */
      pipeline->writes_stencil = false;
      pipeline->stencil_test_enable = false;
      pipeline->writes_depth = false;
      pipeline->depth_test_enable = false;
      pipeline->depth_bounds_test_enable = false;
      memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
      return;
   }

   VkImageAspectFlags ds_aspects = 0;
   if (subpass->depth_stencil_attachment) {
      VkFormat depth_stencil_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      ds_aspects = vk_format_aspects(depth_stencil_format);
   }

   VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
   sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
   pipeline->stencil_test_enable = info.stencilTestEnable;
   pipeline->writes_depth = info.depthWriteEnable;
   pipeline->depth_test_enable = info.depthTestEnable;
   pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;

   bool dynamic_stencil_op =
      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;

#if GEN_GEN <= 7
   struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
#else
   struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
#endif
      .DepthTestEnable =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?
            0 : info.depthTestEnable,

      .DepthBufferWriteEnable =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?
            0 : info.depthWriteEnable,

      .DepthTestFunction =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?
            0 : genX(vk_to_gen_compare_op)[info.depthCompareOp],

      .DoubleSidedStencilEnable = true,

      .StencilTestEnable =
         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?
            0 : info.stencilTestEnable,

      .StencilFailOp = genX(vk_to_gen_stencil_op)[info.front.failOp],
      .StencilPassDepthPassOp = genX(vk_to_gen_stencil_op)[info.front.passOp],
      .StencilPassDepthFailOp = genX(vk_to_gen_stencil_op)[info.front.depthFailOp],
      .StencilTestFunction = genX(vk_to_gen_compare_op)[info.front.compareOp],
      .BackfaceStencilFailOp = genX(vk_to_gen_stencil_op)[info.back.failOp],
      .BackfaceStencilPassDepthPassOp = genX(vk_to_gen_stencil_op)[info.back.passOp],
      .BackfaceStencilPassDepthFailOp = genX(vk_to_gen_stencil_op)[info.back.depthFailOp],
      .BackfaceStencilTestFunction = genX(vk_to_gen_compare_op)[info.back.compareOp],
   };

   if (dynamic_stencil_op) {
      depth_stencil.StencilFailOp = 0;
      depth_stencil.StencilPassDepthPassOp = 0;
      depth_stencil.StencilPassDepthFailOp = 0;
      depth_stencil.StencilTestFunction = 0;
      depth_stencil.BackfaceStencilFailOp = 0;
      depth_stencil.BackfaceStencilPassDepthPassOp = 0;
      depth_stencil.BackfaceStencilPassDepthFailOp = 0;
      depth_stencil.BackfaceStencilTestFunction = 0;
   }

#if GEN_GEN <= 7
   GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
#else
   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
#endif
}

static bool
is_dual_src_blend_factor(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
}

static void
emit_cb_state(struct anv_graphics_pipeline *pipeline,
              const VkPipelineColorBlendStateCreateInfo *info,
              const VkPipelineMultisampleStateCreateInfo *ms_info)
{
   struct anv_device *device = pipeline->base.device;
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   struct GENX(BLEND_STATE) blend_state = {
#if GEN_GEN >= 8
      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
#endif
   };

   uint32_t surface_count = 0;
   struct anv_pipeline_bind_map *map;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
      surface_count = map->surface_count;
   }

   const uint32_t num_dwords = GENX(BLEND_STATE_length) +
      GENX(BLEND_STATE_ENTRY_length) * surface_count;
   pipeline->blend_state =
      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
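   /* The allocation is one BLEND_STATE header followed by a
    * BLEND_STATE_ENTRY per surface; packet lengths are in dwords (hence the
    * * 4 for bytes), and the state is 64-byte aligned because
    * 3DSTATE_BLEND_STATE_POINTERS addresses it at that granularity.
    */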
1143 
1144    bool has_writeable_rt = false;
1145    uint32_t *state_pos = pipeline->blend_state.map;
1146    state_pos += GENX(BLEND_STATE_length);
1147 #if GEN_GEN >= 8
1148    struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
1149 #endif
1150    for (unsigned i = 0; i < surface_count; i++) {
1151       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
1152 
1153       /* All color attachments are at the beginning of the binding table */
1154       if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
1155          break;
1156 
1157       /* We can have at most 8 attachments */
1158       assert(i < 8);
1159 
1160       if (info == NULL || binding->index >= info->attachmentCount) {
1161          /* Default everything to disabled */
1162          struct GENX(BLEND_STATE_ENTRY) entry = {
1163             .WriteDisableAlpha = true,
1164             .WriteDisableRed = true,
1165             .WriteDisableGreen = true,
1166             .WriteDisableBlue = true,
1167          };
1168          GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
1169          state_pos += GENX(BLEND_STATE_ENTRY_length);
1170          continue;
1171       }
1172 
1173       const VkPipelineColorBlendAttachmentState *a =
1174          &info->pAttachments[binding->index];
1175 
1176       struct GENX(BLEND_STATE_ENTRY) entry = {
1177 #if GEN_GEN < 8
1178          .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
1179          .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
1180 #endif
1181          .LogicOpEnable = info->logicOpEnable,
1182          .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
1183          /* Vulkan specification 1.2.168, VkLogicOp:
1184           *
1185           *   "Logical operations are controlled by the logicOpEnable and
1186           *    logicOp members of VkPipelineColorBlendStateCreateInfo. If
1187           *    logicOpEnable is VK_TRUE, then a logical operation selected by
1188           *    logicOp is applied between each color attachment and the
1189           *    fragment’s corresponding output value, and blending of all
1190           *    attachments is treated as if it were disabled."
1191           *
1192           * From the Broadwell PRM Volume 2d: Command Reference: Structures:
1193           * BLEND_STATE_ENTRY:
1194           *
1195           *   "Enabling LogicOp and Color Buffer Blending at the same time is
1196           *    UNDEFINED"
1197           */
1198          .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
1199          .ColorClampRange = COLORCLAMP_RTFORMAT,
1200          .PreBlendColorClampEnable = true,
1201          .PostBlendColorClampEnable = true,
1202          .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
1203          .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
1204          .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
1205          .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
1206          .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
1207          .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
1208          .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
1209          .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
1210          .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
1211          .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
1212       };
1213 
1214       if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
1215           a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
1216           a->colorBlendOp != a->alphaBlendOp) {
1217 #if GEN_GEN >= 8
1218          blend_state.IndependentAlphaBlendEnable = true;
1219 #else
1220          entry.IndependentAlphaBlendEnable = true;
1221 #endif
1222       }
1223 
1224       /* The Dual Source Blending documentation says:
1225        *
1226        * "If SRC1 is included in a src/dst blend factor and
1227        * a DualSource RT Write message is not used, results
1228        * are UNDEFINED. (This reflects the same restriction in DX APIs,
1229        * where undefined results are produced if “o1” is not written
1230        * by a PS – there are no default values defined)."
1231        *
1232        * There is no way to gracefully fix this undefined situation
1233        * so we just disable the blending to prevent possible issues.
1234        */
1235       if (!wm_prog_data->dual_src_blend &&
1236           (is_dual_src_blend_factor(a->srcColorBlendFactor) ||
1237            is_dual_src_blend_factor(a->dstColorBlendFactor) ||
1238            is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||
1239            is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {
1240          vk_debug_report(&device->physical->instance->debug_report_callbacks,
1241                          VK_DEBUG_REPORT_WARNING_BIT_EXT,
1242                          VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT,
1243                          (uint64_t)(uintptr_t)device,
1244                          0, 0, "anv",
1245                          "Enabled dual-src blend factors without writing both targets "
1246                          "in the shader.  Disabling blending to avoid GPU hangs.");
1247          entry.ColorBufferBlendEnable = false;
1248       }
1249 
1250       if (a->colorWriteMask != 0)
1251          has_writeable_rt = true;
1252 
1253       /* Our hardware applies the blend factor prior to the blend function
1254        * regardless of what function is used.  Technically, this means the
1255        * hardware can do MORE than GL or Vulkan specify.  However, it also
1256        * means that, for MIN and MAX, we have to stomp the blend factor to
1257        * ONE to make it a no-op.
1258        */
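      /* Illustrative example (values are hypothetical): with VK_BLEND_OP_MIN
       * and srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA, the hardware
       * would compute min(src * alpha, dst * dstFactor) rather than the
       * min(src, dst) Vulkan requires, hence forcing both factors to ONE.
       */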
1259       if (a->colorBlendOp == VK_BLEND_OP_MIN ||
1260           a->colorBlendOp == VK_BLEND_OP_MAX) {
1261          entry.SourceBlendFactor = BLENDFACTOR_ONE;
1262          entry.DestinationBlendFactor = BLENDFACTOR_ONE;
1263       }
1264       if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
1265           a->alphaBlendOp == VK_BLEND_OP_MAX) {
1266          entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
1267          entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
1268       }
1269       GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
1270       state_pos += GENX(BLEND_STATE_ENTRY_length);
1271 #if GEN_GEN >= 8
1272       if (i == 0)
1273          bs0 = entry;
1274 #endif
1275    }
1276 
1277 #if GEN_GEN >= 8
1278    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), blend) {
1279       blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
1280       blend.HasWriteableRT                = has_writeable_rt;
1281       blend.ColorBufferBlendEnable        = bs0.ColorBufferBlendEnable;
1282       blend.SourceAlphaBlendFactor        = bs0.SourceAlphaBlendFactor;
1283       blend.DestinationAlphaBlendFactor   = bs0.DestinationAlphaBlendFactor;
1284       blend.SourceBlendFactor             = bs0.SourceBlendFactor;
1285       blend.DestinationBlendFactor        = bs0.DestinationBlendFactor;
1286       blend.AlphaTestEnable               = false;
1287       blend.IndependentAlphaBlendEnable   =
1288          blend_state.IndependentAlphaBlendEnable;
1289    }
1290 #else
1291    (void)has_writeable_rt;
1292 #endif
1293 
1294    GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);
1295 
1296    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
1297       bsp.BlendStatePointer      = pipeline->blend_state.offset;
1298 #if GEN_GEN >= 8
1299       bsp.BlendStatePointerValid = true;
1300 #endif
1301    }
1302 }
1303 
1304 static void
1305 emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
1306                   const VkPipelineInputAssemblyStateCreateInfo *ia_info,
1307                   const VkPipelineViewportStateCreateInfo *vp_info,
1308                   const VkPipelineRasterizationStateCreateInfo *rs_info,
1309                   const uint32_t dynamic_states)
1310 {
1311    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1312    (void) wm_prog_data;
1313 
1314    struct GENX(3DSTATE_CLIP) clip = {
1315       GENX(3DSTATE_CLIP_header),
1316    };
1317 
1318    clip.ClipEnable               = true;
1319    clip.StatisticsEnable         = true;
1320    clip.EarlyCullEnable          = true;
1321    clip.APIMode                  = APIMODE_D3D;
1322    clip.GuardbandClipTestEnable  = true;
1323 
1324    /* Only enable the XY clip test when the final polygon rasterization
1325     * mode is VK_POLYGON_MODE_FILL.  We want to leave it disabled for
1326     * points and lines so we get "pop-free" clipping.
1327     */
1328    VkPolygonMode raster_mode =
1329       anv_raster_polygon_mode(pipeline, ia_info, rs_info);
1330    clip.ViewportXYClipTestEnable = (raster_mode == VK_POLYGON_MODE_FILL);
1331 
1332 #if GEN_GEN >= 8
1333    clip.VertexSubPixelPrecisionSelect = _8Bit;
1334 #endif
1335    clip.ClipMode = CLIPMODE_NORMAL;
1336 
1337    clip.TriangleStripListProvokingVertexSelect = 0;
1338    clip.LineStripListProvokingVertexSelect     = 0;
1339    clip.TriangleFanProvokingVertexSelect       = 1;
1340 
1341    clip.MinimumPointWidth = 0.125;
1342    clip.MaximumPointWidth = 255.875;
1343 
1344    const struct brw_vue_prog_data *last =
1345       anv_pipeline_get_last_vue_prog_data(pipeline);
1346 
1347    /* From the Vulkan 1.0.45 spec:
1348     *
1349     *    "If the last active vertex processing stage shader entry point's
1350     *    interface does not include a variable decorated with
1351     *    ViewportIndex, then the first viewport is used."
1352     */
1353    if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
1354       clip.MaximumVPIndex = vp_info->viewportCount > 0 ?
1355          vp_info->viewportCount - 1 : 0;
1356    } else {
1357       clip.MaximumVPIndex = 0;
1358    }
1359 
1360    /* From the Vulkan 1.0.45 spec:
1361     *
1362     *    "If the last active vertex processing stage shader entry point's
1363     *    interface does not include a variable decorated with Layer, then
1364     *    the first layer is used."
1365     */
1366    clip.ForceZeroRTAIndexEnable =
1367       !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
1368 
1369 #if GEN_GEN == 7
1370    clip.FrontWinding            = genX(vk_to_gen_front_face)[rs_info->frontFace];
1371    clip.CullMode                = genX(vk_to_gen_cullmode)[rs_info->cullMode];
1372    clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
1373    clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
1374    clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
1375 #else
1376    clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
1377       (wm_prog_data->barycentric_interp_modes &
1378        BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
1379 #endif
1380 
1381    GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gen7.clip, &clip);
1382 }
1383 
1384 static void
1385 emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
1386                        const VkPipelineRasterizationStateCreateInfo *rs_info)
1387 {
1388    const struct brw_vue_prog_data *prog_data =
1389       anv_pipeline_get_last_vue_prog_data(pipeline);
1390    const struct brw_vue_map *vue_map = &prog_data->vue_map;
1391 
1392    nir_xfb_info *xfb_info;
1393    if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1394       xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
1395    else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
1396       xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
1397    else
1398       xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;
1399 
1400    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so) {
1401       so.RenderingDisable = rs_info->rasterizerDiscardEnable;
1402 
1403       if (xfb_info) {
1404          so.SOFunctionEnable = true;
1405          so.SOStatisticsEnable = true;
1406 
1407          const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
1408             vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
1409          so.RenderStreamSelect = stream_info ?
1410                                  stream_info->rasterizationStream : 0;
1411 
1412 #if GEN_GEN >= 8
1413          so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
1414          so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
1415          so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
1416          so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
1417 #else
1418          pipeline->gen7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;
1419          pipeline->gen7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;
1420          pipeline->gen7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;
1421          pipeline->gen7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;
1422 
1423          /* On Gen7, the SO buffer enables live in 3DSTATE_STREAMOUT which
1424           * is a bit inconvenient because we don't know what buffers will
1425           * actually be enabled until draw time.  We do our best here by
1426           * setting them based on buffers_written and we disable them
          * as needed at draw time by setting EndAddress = BaseAddress.
1428           */
1429          so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);
1430          so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);
1431          so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);
1432          so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);
1433 #endif
1434 
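         /* The read offset/length fields below are in 256-bit units, i.e.
          * pairs of 128-bit VUE slots, hence the round-up divide by two.
          */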
1435          int urb_entry_read_offset = 0;
1436          int urb_entry_read_length =
1437             (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
1438 
1439          /* We always read the whole vertex.  This could be reduced at some
1440           * point by reading less and offsetting the register index in the
1441           * SO_DECLs.
1442           */
1443          so.Stream0VertexReadOffset = urb_entry_read_offset;
1444          so.Stream0VertexReadLength = urb_entry_read_length - 1;
1445          so.Stream1VertexReadOffset = urb_entry_read_offset;
1446          so.Stream1VertexReadLength = urb_entry_read_length - 1;
1447          so.Stream2VertexReadOffset = urb_entry_read_offset;
1448          so.Stream2VertexReadLength = urb_entry_read_length - 1;
1449          so.Stream3VertexReadOffset = urb_entry_read_offset;
1450          so.Stream3VertexReadLength = urb_entry_read_length - 1;
1451       }
1452    }
1453 
1454    if (xfb_info) {
1455       struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
1456       int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
1457       int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
1458 
1459       memset(so_decl, 0, sizeof(so_decl));
1460 
1461       for (unsigned i = 0; i < xfb_info->output_count; i++) {
1462          const nir_xfb_output_info *output = &xfb_info->outputs[i];
1463          unsigned buffer = output->buffer;
1464          unsigned stream = xfb_info->buffer_to_stream[buffer];
1465 
1466          /* Our hardware is unusual in that it requires us to program SO_DECLs
1467           * for fake "hole" components, rather than simply taking the offset
1468           * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
1469           * program as many size = 4 holes as we can, then a final hole to
1470           * accommodate the final 1, 2, or 3 remaining.
1471           */
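         /* For instance, a 7-dword gap is emitted as one hole with
          * ComponentMask 0xf followed by one with ComponentMask 0x7.
          */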
1472          int hole_dwords = (output->offset - next_offset[buffer]) / 4;
1473          while (hole_dwords > 0) {
1474             so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1475                .HoleFlag = 1,
1476                .OutputBufferSlot = buffer,
1477                .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
1478             };
1479             hole_dwords -= 4;
1480          }
1481 
1482          int varying = output->location;
1483          uint8_t component_mask = output->component_mask;
1484          /* VARYING_SLOT_PSIZ contains three scalar fields packed together:
1485           * - VARYING_SLOT_LAYER    in VARYING_SLOT_PSIZ.y
1486           * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
1487           * - VARYING_SLOT_PSIZ     in VARYING_SLOT_PSIZ.w
1488           */
1489          if (varying == VARYING_SLOT_LAYER) {
1490             varying = VARYING_SLOT_PSIZ;
1491             component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
1492          } else if (varying == VARYING_SLOT_VIEWPORT) {
1493             varying = VARYING_SLOT_PSIZ;
1494             component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
1495          } else if (varying == VARYING_SLOT_PSIZ) {
1496             component_mask = 1 << 3; // SO_DECL_COMPMASK_W
1497          }
1498 
1499          next_offset[buffer] = output->offset +
1500                                __builtin_popcount(component_mask) * 4;
1501 
1502          const int slot = vue_map->varying_to_slot[varying];
1503          if (slot < 0) {
1504             /* This can happen if the shader never writes to the varying.
1505              * Insert a hole instead of actual varying data.
1506              */
1507             so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1508                .HoleFlag = true,
1509                .OutputBufferSlot = buffer,
1510                .ComponentMask = component_mask,
1511             };
1512          } else {
1513             so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1514                .OutputBufferSlot = buffer,
1515                .RegisterIndex = slot,
1516                .ComponentMask = component_mask,
1517             };
1518          }
1519       }
1520 
1521       int max_decls = 0;
1522       for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
1523          max_decls = MAX2(max_decls, decls[s]);
1524 
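      /* Build the per-stream buffer-select masks.  For example, if buffers 0
       * and 1 are both written by stream 0, sbs[0] ends up as 0x3.
       */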
1525       uint8_t sbs[MAX_XFB_STREAMS] = { };
1526       for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
1527          if (xfb_info->buffers_written & (1 << b))
1528             sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
1529       }
1530 
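      /* Each SO_DECL_ENTRY packs one SO_DECL per stream into two dwords, so
       * the command is 3 header dwords plus 2 * max_decls of entry payload
       * (packed at dw + 3 + i * 2 below).
       */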
1531       uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,
1532                                      GENX(3DSTATE_SO_DECL_LIST),
1533                                      .StreamtoBufferSelects0 = sbs[0],
1534                                      .StreamtoBufferSelects1 = sbs[1],
1535                                      .StreamtoBufferSelects2 = sbs[2],
1536                                      .StreamtoBufferSelects3 = sbs[3],
1537                                      .NumEntries0 = decls[0],
1538                                      .NumEntries1 = decls[1],
1539                                      .NumEntries2 = decls[2],
1540                                      .NumEntries3 = decls[3]);
1541 
1542       for (int i = 0; i < max_decls; i++) {
1543          GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
1544             &(struct GENX(SO_DECL_ENTRY)) {
1545                .Stream0Decl = so_decl[0][i],
1546                .Stream1Decl = so_decl[1][i],
1547                .Stream2Decl = so_decl[2][i],
1548                .Stream3Decl = so_decl[3][i],
1549             });
1550       }
1551    }
1552 }
1553 
1554 static uint32_t
1555 get_sampler_count(const struct anv_shader_bin *bin)
1556 {
1557    uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);
1558 
1559    /* We can potentially have way more than 32 samplers and that's ok.
1560     * However, the 3DSTATE_XS packets only have 3 bits to specify how
1561     * many to pre-fetch and all values above 4 are marked reserved.
1562     */
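   /* E.g., a bind map with 9 samplers gives DIV_ROUND_UP(9, 4) = 3, which
    * the hardware reads as "between 9 and 12 samplers"; anything above 16
    * samplers clamps to the maximum pre-fetch encoding of 4.
    */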
1563    return MIN2(count_by_4, 4);
1564 }
1565 
1566 static uint32_t
1567 get_binding_table_entry_count(const struct anv_shader_bin *bin)
1568 {
1569    return DIV_ROUND_UP(bin->bind_map.surface_count, 32);
1570 }
1571 
1572 static struct anv_address
1573 get_scratch_address(struct anv_pipeline *pipeline,
1574                     gl_shader_stage stage,
1575                     const struct anv_shader_bin *bin)
1576 {
1577    return (struct anv_address) {
1578       .bo = anv_scratch_pool_alloc(pipeline->device,
1579                                    &pipeline->device->scratch_pool,
1580                                    stage, bin->prog_data->total_scratch),
1581       .offset = 0,
1582    };
1583 }
1584 
1585 static uint32_t
1586 get_scratch_space(const struct anv_shader_bin *bin)
1587 {
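   /* Per-thread scratch space is a power-of-two encoding where 0 = 1KB,
    * 1 = 2KB, 2 = 4KB, etc.  E.g., total_scratch = 4096 gives
    * ffs(4096 / 2048) = ffs(2) = 2, i.e. 4KB per thread.  This relies on
    * total_scratch being a power-of-two multiple of 1KB.
    */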
1588    return ffs(bin->prog_data->total_scratch / 2048);
1589 }
1590 
1591 static void
1592 emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
1593 {
1594    const struct gen_device_info *devinfo = &pipeline->base.device->info;
1595    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1596    const struct anv_shader_bin *vs_bin =
1597       pipeline->shaders[MESA_SHADER_VERTEX];
1598 
1599    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
1600 
1601    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
1602       vs.Enable               = true;
1603       vs.StatisticsEnable     = true;
1604       vs.KernelStartPointer   = vs_bin->kernel.offset;
1605 #if GEN_GEN >= 8
1606       vs.SIMD8DispatchEnable  =
1607          vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
1608 #endif
1609 
1610       assert(!vs_prog_data->base.base.use_alt_mode);
1611 #if GEN_GEN < 11
1612       vs.SingleVertexDispatch       = false;
1613 #endif
1614       vs.VectorMaskEnable           = false;
1615       /* WA_1606682166:
1616        * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
1617        * Disable the Sampler state prefetch functionality in the SARB by
1618        * programming 0xB000[30] to '1'.
1619        */
1620       vs.SamplerCount               = GEN_GEN == 11 ? 0 : get_sampler_count(vs_bin);
1621       vs.BindingTableEntryCount     = get_binding_table_entry_count(vs_bin);
1622       vs.FloatingPointMode          = IEEE754;
1623       vs.IllegalOpcodeExceptionEnable = false;
1624       vs.SoftwareExceptionEnable    = false;
1625       vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;
1626 
1627       if (GEN_GEN == 9 && devinfo->gt == 4 &&
1628           anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1629          /* On Sky Lake GT4, we have experienced some hangs related to the VS
1630           * cache and tessellation.  It is unknown exactly what is happening
1631           * but the Haswell docs for the "VS Reference Count Full Force Miss
1632           * Enable" field of the "Thread Mode" register refer to a HSW bug in
1633           * which the VUE handle reference count would overflow resulting in
1634           * internal reference counting bugs.  My (Jason's) best guess is that
1635           * this bug cropped back up on SKL GT4 when we suddenly had more
1636           * threads in play than any previous gen9 hardware.
1637           *
1638           * What we do know for sure is that setting this bit when
1639           * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
1640           * City when playing with DXVK (https://bugs.freedesktop.org/107280).
1641           * Disabling the vertex cache with tessellation shaders should only
1642           * have a minor performance impact as the tessellation shaders are
1643           * likely generating and processing far more geometry than the vertex
1644           * stage.
1645           */
1646          vs.VertexCacheDisable = true;
1647       }
1648 
1649       vs.VertexURBEntryReadLength      = vs_prog_data->base.urb_read_length;
1650       vs.VertexURBEntryReadOffset      = 0;
1651       vs.DispatchGRFStartRegisterForURBData =
1652          vs_prog_data->base.base.dispatch_grf_start_reg;
1653 
1654 #if GEN_GEN >= 8
1655       vs.UserClipDistanceClipTestEnableBitmask =
1656          vs_prog_data->base.clip_distance_mask;
1657       vs.UserClipDistanceCullTestEnableBitmask =
1658          vs_prog_data->base.cull_distance_mask;
1659 #endif
1660 
1661       vs.PerThreadScratchSpace   = get_scratch_space(vs_bin);
1662       vs.ScratchSpaceBasePointer =
1663          get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
1664    }
1665 }
1666 
1667 static void
1668 emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
1669                       const VkPipelineTessellationStateCreateInfo *tess_info)
1670 {
1671    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1672       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
1673       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
1674       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
1675       return;
1676    }
1677 
1678    const struct gen_device_info *devinfo = &pipeline->base.device->info;
1679    const struct anv_shader_bin *tcs_bin =
1680       pipeline->shaders[MESA_SHADER_TESS_CTRL];
1681    const struct anv_shader_bin *tes_bin =
1682       pipeline->shaders[MESA_SHADER_TESS_EVAL];
1683 
1684    const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1685    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1686 
1687    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
1688       hs.Enable = true;
1689       hs.StatisticsEnable = true;
1690       hs.KernelStartPointer = tcs_bin->kernel.offset;
1691       /* WA_1606682166 */
1692       hs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(tcs_bin);
1693       hs.BindingTableEntryCount = get_binding_table_entry_count(tcs_bin);
1694 
1695 #if GEN_GEN >= 12
1696       /* GEN:BUG:1604578095:
1697        *
       *    Hang occurs when the maximum number of threads is less than 2
       *    times the instance count.  The maximum number of threads must be
       *    more than 2 times the instance count.
1701        */
1702       assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
1703 #endif
1704 
1705       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1706       hs.IncludeVertexHandles = true;
1707       hs.InstanceCount = tcs_prog_data->instances - 1;
1708 
1709       hs.VertexURBEntryReadLength = 0;
1710       hs.VertexURBEntryReadOffset = 0;
1711       hs.DispatchGRFStartRegisterForURBData =
1712          tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
1713 #if GEN_GEN >= 12
1714       hs.DispatchGRFStartRegisterForURBData5 =
1715          tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
1716 #endif
1717 
1719       hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1720       hs.ScratchSpaceBasePointer =
1721          get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
1722 
1723 #if GEN_GEN == 12
1724       /*  Patch Count threshold specifies the maximum number of patches that
1725        *  will be accumulated before a thread dispatch is forced.
1726        */
1727       hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
1728 #endif
1729 
1730 #if GEN_GEN >= 9
1731       hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
1732       hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
1733 #endif
1734    }
1735 
1736    const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
1737       tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;
1738 
1739    VkTessellationDomainOrigin uv_origin =
1740       domain_origin_state ? domain_origin_state->domainOrigin :
1741                             VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
1742 
1743    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
1744       te.Partitioning = tes_prog_data->partitioning;
1745 
1746       if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
1747          te.OutputTopology = tes_prog_data->output_topology;
1748       } else {
1749          /* When the origin is upper-left, we have to flip the winding order */
1750          if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
1751             te.OutputTopology = OUTPUT_TRI_CW;
1752          } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
1753             te.OutputTopology = OUTPUT_TRI_CCW;
1754          } else {
1755             te.OutputTopology = tes_prog_data->output_topology;
1756          }
1757       }
1758 
1759       te.TEDomain = tes_prog_data->domain;
1760       te.TEEnable = true;
1761       te.MaximumTessellationFactorOdd = 63.0;
1762       te.MaximumTessellationFactorNotOdd = 64.0;
1763    }
1764 
1765    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
1766       ds.Enable = true;
1767       ds.StatisticsEnable = true;
1768       ds.KernelStartPointer = tes_bin->kernel.offset;
1769       /* WA_1606682166 */
1770       ds.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(tes_bin);
1771       ds.BindingTableEntryCount = get_binding_table_entry_count(tes_bin);
1772       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1773 
1774       ds.ComputeWCoordinateEnable =
1775          tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
1776 
1777       ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1778       ds.PatchURBEntryReadOffset = 0;
1779       ds.DispatchGRFStartRegisterForURBData =
1780          tes_prog_data->base.base.dispatch_grf_start_reg;
1781 
1782 #if GEN_GEN >= 8
1783 #if GEN_GEN < 11
1784       ds.DispatchMode =
1785          tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
1786             DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1787             DISPATCH_MODE_SIMD4X2;
1788 #else
1789       assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
1790       ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
1791 #endif
1792 
1793       ds.UserClipDistanceClipTestEnableBitmask =
1794          tes_prog_data->base.clip_distance_mask;
1795       ds.UserClipDistanceCullTestEnableBitmask =
1796          tes_prog_data->base.cull_distance_mask;
1797 #endif
1798 
1799       ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1800       ds.ScratchSpaceBasePointer =
1801          get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
1802    }
1803 }
1804 
1805 static void
1806 emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
1807 {
1808    const struct gen_device_info *devinfo = &pipeline->base.device->info;
1809    const struct anv_shader_bin *gs_bin =
1810       pipeline->shaders[MESA_SHADER_GEOMETRY];
1811 
1812    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1813       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);
1814       return;
1815    }
1816 
1817    const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
1818 
1819    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {
1820       gs.Enable                  = true;
1821       gs.StatisticsEnable        = true;
1822       gs.KernelStartPointer      = gs_bin->kernel.offset;
1823       gs.DispatchMode            = gs_prog_data->base.dispatch_mode;
1824 
1825       gs.SingleProgramFlow       = false;
1826       gs.VectorMaskEnable        = false;
1827       /* WA_1606682166 */
1828       gs.SamplerCount            = GEN_GEN == 11 ? 0 : get_sampler_count(gs_bin);
1829       gs.BindingTableEntryCount  = get_binding_table_entry_count(gs_bin);
1830       gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
1831       gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;
1832 
1833       if (GEN_GEN == 8) {
1834          /* Broadwell is weird.  It needs us to divide by 2. */
1835          gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
1836       } else {
1837          gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
1838       }
1839 
1840       gs.OutputVertexSize        = gs_prog_data->output_vertex_size_hwords * 2 - 1;
1841       gs.OutputTopology          = gs_prog_data->output_topology;
1843       gs.ControlDataFormat       = gs_prog_data->control_data_format;
1844       gs.ControlDataHeaderSize   = gs_prog_data->control_data_header_size_hwords;
1845       gs.InstanceControl         = MAX2(gs_prog_data->invocations, 1) - 1;
1846       gs.ReorderMode             = TRAILING;
1847 
1848 #if GEN_GEN >= 8
1849       gs.ExpectedVertexCount     = gs_prog_data->vertices_in;
1850       gs.StaticOutput            = gs_prog_data->static_vertex_count >= 0;
1851       gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
1852                                    gs_prog_data->static_vertex_count : 0;
1853 #endif
1854 
1855       gs.VertexURBEntryReadOffset = 0;
1856       gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
1857       gs.DispatchGRFStartRegisterForURBData =
1858          gs_prog_data->base.base.dispatch_grf_start_reg;
1859 
1860 #if GEN_GEN >= 8
1861       gs.UserClipDistanceClipTestEnableBitmask =
1862          gs_prog_data->base.clip_distance_mask;
1863       gs.UserClipDistanceCullTestEnableBitmask =
1864          gs_prog_data->base.cull_distance_mask;
1865 #endif
1866 
1867       gs.PerThreadScratchSpace   = get_scratch_space(gs_bin);
1868       gs.ScratchSpaceBasePointer =
1869          get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
1870    }
1871 }
1872 
1873 static bool
1874 has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,
1875                                const VkPipelineColorBlendStateCreateInfo *blend)
1876 {
1877    const struct anv_shader_bin *shader_bin =
1878       pipeline->shaders[MESA_SHADER_FRAGMENT];
1879    if (!shader_bin)
1880       return false;
1881 
1882    const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
1883    for (int i = 0; i < bind_map->surface_count; i++) {
1884       struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];
1885 
1886       if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
1887          continue;
1888 
1889       if (binding->index == UINT32_MAX)
1890          continue;
1891 
1892       if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
1893          return true;
1894    }
1895 
1896    return false;
1897 }
1898 
1899 static void
1900 emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
1901                 const VkPipelineInputAssemblyStateCreateInfo *ia,
1902                 const VkPipelineRasterizationStateCreateInfo *raster,
1903                 const VkPipelineColorBlendStateCreateInfo *blend,
1904                 const VkPipelineMultisampleStateCreateInfo *multisample,
1905                 const VkPipelineRasterizationLineStateCreateInfoEXT *line)
1906 {
1907    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1908 
1909    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), wm) {
1910       wm.StatisticsEnable                    = true;
1911       wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
1912       wm.LineAntialiasingRegionWidth         = _10pixels;
1913       wm.PointRasterizationRule              = RASTRULE_UPPER_RIGHT;
1914 
1915       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1916          if (wm_prog_data->early_fragment_tests) {
1917             wm.EarlyDepthStencilControl         = EDSC_PREPS;
1918          } else if (wm_prog_data->has_side_effects) {
1919             wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
1920          } else {
1921             wm.EarlyDepthStencilControl         = EDSC_NORMAL;
1922          }
1923 
1924 #if GEN_GEN >= 8
1925          /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
1926           * doesn't take into account KillPixels when no depth or stencil
1927           * writes are enabled.  In order for occlusion queries to work
1928           * correctly with no attachments, we need to force-enable PS thread
1929           * dispatch.
1930           *
          * The BDW docs are pretty clear that this bit isn't validated
1932           * and probably shouldn't be used in production:
1933           *
1934           *    "This must always be set to Normal. This field should not be
1935           *    tested for functional validation."
1936           *
1937           * Unfortunately, however, the other mechanism we have for doing this
1938           * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
1939           * Given two bad options, we choose the one which works.
1940           */
1941          if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
1942              !has_color_buffer_write_enabled(pipeline, blend))
1943             wm.ForceThreadDispatchEnable = ForceON;
1944 #endif
1945 
1946          wm.BarycentricInterpolationMode =
1947             wm_prog_data->barycentric_interp_modes;
1948 
1949 #if GEN_GEN < 8
1950          wm.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
1951          wm.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
1952          wm.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
1953          wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
1954 
1955          /* If the subpass has a depth or stencil self-dependency, then we
1956           * need to force the hardware to do the depth/stencil write *after*
1957           * fragment shader execution.  Otherwise, the writes may hit memory
1958           * before we get around to fetching from the input attachment and we
1959           * may get the depth or stencil value from the current draw rather
1960           * than the previous one.
1961           */
1962          wm.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
1963                                             wm_prog_data->uses_kill;
1964 
1965          if (wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
1966              wm_prog_data->has_side_effects ||
1967              wm.PixelShaderKillsPixel ||
1968              has_color_buffer_write_enabled(pipeline, blend))
1969             wm.ThreadDispatchEnable = true;
1970 
1971          if (multisample && multisample->rasterizationSamples > 1) {
1972             if (wm_prog_data->persample_dispatch) {
1973                wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1974             } else {
1975                wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
1976             }
1977          } else {
1978             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1979          }
1980          wm.MultisampleRasterizationMode =
1981             gen7_ms_rast_mode(pipeline, ia, raster, multisample);
1982 #endif
1983 
1984          wm.LineStippleEnable = line && line->stippledLineEnable;
1985       }
1986    }
1987 }
1988 
1989 static void
1990 emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
1991                 const VkPipelineColorBlendStateCreateInfo *blend,
1992                 const VkPipelineMultisampleStateCreateInfo *multisample)
1993 {
1994    UNUSED const struct gen_device_info *devinfo = &pipeline->base.device->info;
1995    const struct anv_shader_bin *fs_bin =
1996       pipeline->shaders[MESA_SHADER_FRAGMENT];
1997 
1998    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1999       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
2000 #if GEN_GEN == 7
2001          /* Even if no fragments are ever dispatched, gen7 hardware hangs if
2002           * we don't at least set the maximum number of threads.
2003           */
2004          ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
2005 #endif
2006       }
2007       return;
2008    }
2009 
2010    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2011 
2012 #if GEN_GEN < 8
2013    /* The hardware wedges if you have this bit set but don't turn on any dual
2014     * source blend factors.
2015     */
2016    bool dual_src_blend = false;
2017    if (wm_prog_data->dual_src_blend && blend) {
2018       for (uint32_t i = 0; i < blend->attachmentCount; i++) {
2019          const VkPipelineColorBlendAttachmentState *bstate =
2020             &blend->pAttachments[i];
2021 
2022          if (bstate->blendEnable &&
2023              (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
2024               is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
2025               is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
2026               is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
2027             dual_src_blend = true;
2028             break;
2029          }
2030       }
2031    }
2032 #endif
2033 
2034    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
2035       ps._8PixelDispatchEnable      = wm_prog_data->dispatch_8;
2036       ps._16PixelDispatchEnable     = wm_prog_data->dispatch_16;
2037       ps._32PixelDispatchEnable     = wm_prog_data->dispatch_32;
2038 
2039       /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
2040        *
2041        *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
2042        *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
2043        *
2044        * Since 16x MSAA is first introduced on SKL, we don't need to apply
2045        * the workaround on any older hardware.
2046        */
2047       if (GEN_GEN >= 9 && !wm_prog_data->persample_dispatch &&
2048           multisample && multisample->rasterizationSamples == 16) {
2049          assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
2050          ps._32PixelDispatchEnable = false;
2051       }
2052 
2053       ps.KernelStartPointer0 = fs_bin->kernel.offset +
2054                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
2055       ps.KernelStartPointer1 = fs_bin->kernel.offset +
2056                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
2057       ps.KernelStartPointer2 = fs_bin->kernel.offset +
2058                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
2059 
2060       ps.SingleProgramFlow          = false;
2061       ps.VectorMaskEnable           = GEN_GEN >= 8;
2062       /* WA_1606682166 */
2063       ps.SamplerCount               = GEN_GEN == 11 ? 0 : get_sampler_count(fs_bin);
2064       ps.BindingTableEntryCount     = get_binding_table_entry_count(fs_bin);
2065       ps.PushConstantEnable         = wm_prog_data->base.nr_params > 0 ||
2066                                       wm_prog_data->base.ubo_ranges[0].length;
2067       ps.PositionXYOffsetSelect     = wm_prog_data->uses_pos_offset ?
                                      POSOFFSET_SAMPLE : POSOFFSET_NONE;
2069 #if GEN_GEN < 8
2070       ps.AttributeEnable            = wm_prog_data->num_varying_inputs > 0;
2071       ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
2072       ps.DualSourceBlendEnable      = dual_src_blend;
2073 #endif
2074 
2075 #if GEN_IS_HASWELL
2076       /* Haswell requires the sample mask to be set in this packet as well
2077        * as in 3DSTATE_SAMPLE_MASK; the values should match.
2078        */
2079       ps.SampleMask                 = 0xff;
2080 #endif
2081 
2082 #if GEN_GEN >= 9
2083       ps.MaximumNumberofThreadsPerPSD  = 64 - 1;
2084 #elif GEN_GEN >= 8
2085       ps.MaximumNumberofThreadsPerPSD  = 64 - 2;
2086 #else
2087       ps.MaximumNumberofThreads        = devinfo->max_wm_threads - 1;
2088 #endif
2089 
2090       ps.DispatchGRFStartRegisterForConstantSetupData0 =
2091          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
2092       ps.DispatchGRFStartRegisterForConstantSetupData1 =
2093          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
2094       ps.DispatchGRFStartRegisterForConstantSetupData2 =
2095          brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
2096 
2097       ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
2098       ps.ScratchSpaceBasePointer =
2099          get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
2100    }
2101 }
2102 
2103 #if GEN_GEN >= 8
2104 static void
2105 emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
2106                       struct anv_subpass *subpass)
2107 {
2108    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2109 
2110    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2111       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);
2112       return;
2113    }
2114 
2115    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {
2116       ps.PixelShaderValid              = true;
2117       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
2118       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
2119       ps.PixelShaderIsPerSample        = wm_prog_data->persample_dispatch;
2120       ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
2121       ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
2122       ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
2123 
2124       /* If the subpass has a depth or stencil self-dependency, then we need
2125        * to force the hardware to do the depth/stencil write *after* fragment
2126        * shader execution.  Otherwise, the writes may hit memory before we get
2127        * around to fetching from the input attachment and we may get the depth
2128        * or stencil value from the current draw rather than the previous one.
2129        */
2130       ps.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
2131                                          wm_prog_data->uses_kill;
2132 
2133 #if GEN_GEN >= 9
2134       ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
2135       ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
2136 
2137       ps.InputCoverageMaskState  = ICMS_NONE;
2138       if (wm_prog_data->uses_sample_mask) {
2139          if (wm_prog_data->post_depth_coverage)
2140             ps.InputCoverageMaskState  = ICMS_DEPTH_COVERAGE;
2141          else
2142             ps.InputCoverageMaskState  = ICMS_INNER_CONSERVATIVE;
2143       }
2144 #else
2145       ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2146 #endif
2147    }
2148 }
2149 
2150 static void
2151 emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)
2152 {
2153    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
2154       vft.PrimitiveTopologyType = pipeline->topology;
2155    }
2156 }
2157 #endif
2158 
2159 static void
2160 emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
2161 {
2162    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
2163       vfs.StatisticsEnable = true;
2164    }
2165 }
2166 
2167 static void
2168 compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
2169                    const VkPipelineMultisampleStateCreateInfo *ms_info,
2170                    const struct anv_subpass *subpass)
2171 {
2172    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2173       pipeline->kill_pixel = false;
2174       return;
2175    }
2176 
2177    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2178 
2179    /* This computes the KillPixel portion of the computation for whether or
2180     * not we want to enable the PMA fix on gen8 or gen9.  It's given by this
2181     * chunk of the giant formula:
2182     *
2183     *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2184     *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2185     *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2186     *     3DSTATE_PS_BLEND::AlphaTestEnable ||
2187     *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2188     *
2189     * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
2190     * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
2191     * of an alpha test.
2192     */
2193    pipeline->kill_pixel =
2194       subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
2195       wm_prog_data->uses_omask ||
2196       (ms_info && ms_info->alphaToCoverageEnable);
2197 }
2198 
2199 #if GEN_GEN == 12
2200 static void
2201 emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
2202 {
2203    if (!pipeline->use_primitive_replication) {
2204       anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
2205       return;
2206    }
2207 
2208    uint32_t view_mask = pipeline->subpass->view_mask;
2209    int view_count = util_bitcount(view_mask);
2210    assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
2211 
2212    anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
2213       pr.ReplicaMask = (1 << view_count) - 1;
2214       pr.ReplicationCount = view_count - 1;
2215 
2216       int i = 0, view_index;
2217       for_each_bit(view_index, view_mask) {
2218          pr.RTAIOffset[i] = view_index;
2219          i++;
2220       }
2221    }
2222 }
2223 #endif
2224 
2225 static VkResult
2226 genX(graphics_pipeline_create)(
2227     VkDevice                                    _device,
2228     struct anv_pipeline_cache *                 cache,
2229     const VkGraphicsPipelineCreateInfo*         pCreateInfo,
2230     const VkAllocationCallbacks*                pAllocator,
2231     VkPipeline*                                 pPipeline)
2232 {
2233    ANV_FROM_HANDLE(anv_device, device, _device);
2234    ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
2235    struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
2236    struct anv_graphics_pipeline *pipeline;
2237    VkResult result;
2238 
2239    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
2240 
2241    /* Use the default pipeline cache if none is specified */
2242    if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2243       cache = &device->default_pipeline_cache;
2244 
2245    pipeline = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
2246                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2247    if (pipeline == NULL)
2248       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
2249 
2250    result = anv_graphics_pipeline_init(pipeline, device, cache,
2251                                        pCreateInfo, pAllocator);
2252    if (result != VK_SUCCESS) {
2253       vk_free2(&device->vk.alloc, pAllocator, pipeline);
2254       if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
2255          *pPipeline = VK_NULL_HANDLE;
2256       return result;
2257    }
2258 
2259    /* If rasterization is not enabled, various CreateInfo structs must be
2260     * ignored.
2261     */
2262    const bool raster_enabled =
2263       !pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
2264 
2265    const VkPipelineViewportStateCreateInfo *vp_info =
2266       raster_enabled ? pCreateInfo->pViewportState : NULL;
2267 
2268    const VkPipelineMultisampleStateCreateInfo *ms_info =
2269       raster_enabled ? pCreateInfo->pMultisampleState : NULL;
2270 
2271    const VkPipelineDepthStencilStateCreateInfo *ds_info =
2272       raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
2273 
2274    const VkPipelineColorBlendStateCreateInfo *cb_info =
2275       raster_enabled ? pCreateInfo->pColorBlendState : NULL;
2276 
2277    const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
2278       vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
2279                            PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
2280 
2281    /* Information on which states are considered dynamic. */
2282    const VkPipelineDynamicStateCreateInfo *dyn_info =
2283       pCreateInfo->pDynamicState;
2284    uint32_t dynamic_states = 0;
2285    if (dyn_info) {
2286       for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
2287          dynamic_states |=
2288             anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
2289    }
2290 
2291    enum gen_urb_deref_block_size urb_deref_block_size;
2292    emit_urb_setup(pipeline, &urb_deref_block_size);
2293 
2294    assert(pCreateInfo->pVertexInputState);
2295    emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
2296    assert(pCreateInfo->pRasterizationState);
2297    emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
2298                            pCreateInfo->pRasterizationState,
2299                            ms_info, line_info, dynamic_states, pass, subpass,
2300                            urb_deref_block_size);
2301    emit_ms_state(pipeline, ms_info);
2302    emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
2303    emit_cb_state(pipeline, cb_info, ms_info);
2304    compute_kill_pixel(pipeline, ms_info, subpass);
2305 
2306    emit_3dstate_clip(pipeline,
2307                      pCreateInfo->pInputAssemblyState,
2308                      vp_info,
2309                      pCreateInfo->pRasterizationState,
2310                      dynamic_states);
2311    emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState);
2312 
2313 #if GEN_GEN == 12
2314    emit_3dstate_primitive_replication(pipeline);
2315 #endif
2316 
2317 #if 0
2318    /* From gen7_vs_state.c */
2319 
2320    /**
2321     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2322     * Geometry > Geometry Shader > State:
2323     *
2324     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2325     *     whole fixed function pipeline when the GS enable changes value in
2326     *     the 3DSTATE_GS."
2327     *
2328     * The hardware architects have clarified that in this context "flush the
2329     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2330     * Stall" bit set.
2331     */
2332    if (!device->info.is_haswell && !device->info.is_baytrail)
2333       gen7_emit_vs_workaround_flush(brw);
2334 #endif
2335 
2336    emit_3dstate_vs(pipeline);
2337    emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
2338    emit_3dstate_gs(pipeline);
2339    emit_3dstate_sbe(pipeline);
2340    emit_3dstate_wm(pipeline, subpass,
2341                    pCreateInfo->pInputAssemblyState,
2342                    pCreateInfo->pRasterizationState,
2343                    cb_info, ms_info, line_info);
2344    emit_3dstate_ps(pipeline, cb_info, ms_info);
2345 #if GEN_GEN >= 8
2346    emit_3dstate_ps_extra(pipeline, subpass);
2347 
2348    if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
2349       emit_3dstate_vf_topology(pipeline);
2350 #endif
2351    emit_3dstate_vf_statistics(pipeline);
2352 
2353    *pPipeline = anv_pipeline_to_handle(&pipeline->base);
2354 
2355    return pipeline->base.batch.status;
2356 }
2357 
2358 static void
2359 emit_media_cs_state(struct anv_compute_pipeline *pipeline,
2360                     const struct anv_device *device)
2361 {
2362    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2363 
2364    anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2365 
2366    const struct anv_cs_parameters cs_params = anv_cs_parameters(pipeline);
2367 
2368    pipeline->cs_right_mask = brw_cs_right_mask(cs_params.group_size, cs_params.simd_size);
2369 
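   /* CURBE space in 32-byte register units, rounded up to an even count:
    * one copy of the per-thread push constants for every thread plus a
    * single shared copy of the cross-thread constants.
    */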
2370    const uint32_t vfe_curbe_allocation =
2371       ALIGN(cs_prog_data->push.per_thread.regs * cs_params.threads +
2372             cs_prog_data->push.cross_thread.regs, 2);
2373 
2374    const uint32_t subslices = MAX2(device->physical->subslice_total, 1);
2375 
2376    const struct anv_shader_bin *cs_bin = pipeline->cs;
2377    const struct gen_device_info *devinfo = &device->info;
2378 
2379    anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
2380 #if GEN_GEN > 7
2381       vfe.StackSize              = 0;
2382 #else
2383       vfe.GPGPUMode              = true;
2384 #endif
2385       vfe.MaximumNumberofThreads =
2386          devinfo->max_cs_threads * subslices - 1;
2387       vfe.NumberofURBEntries     = GEN_GEN <= 7 ? 0 : 2;
2388 #if GEN_GEN < 11
2389       vfe.ResetGatewayTimer      = true;
2390 #endif
2391 #if GEN_GEN <= 8
2392       vfe.BypassGatewayControl   = true;
2393 #endif
2394       vfe.URBEntryAllocationSize = GEN_GEN <= 7 ? 0 : 2;
2395       vfe.CURBEAllocationSize    = vfe_curbe_allocation;
2396 
2397       if (cs_bin->prog_data->total_scratch) {
2398          if (GEN_GEN >= 8) {
2399             /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
2400              * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
2401              */
2402             vfe.PerThreadScratchSpace =
2403                ffs(cs_bin->prog_data->total_scratch) - 11;
2404          } else if (GEN_IS_HASWELL) {
2405             /* Haswell's Per Thread Scratch Space is in the range [0, 10]
2406              * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
2407              */
2408             vfe.PerThreadScratchSpace =
2409                ffs(cs_bin->prog_data->total_scratch) - 12;
2410          } else {
2411             /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
2412              * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
2413              */
2414             vfe.PerThreadScratchSpace =
2415                cs_bin->prog_data->total_scratch / 1024 - 1;
2416          }
2417          vfe.ScratchSpaceBasePointer =
2418             get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2419       }
2420    }
2421 
2422    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
2423       .KernelStartPointer     =
2424          cs_bin->kernel.offset +
2425          brw_cs_prog_data_prog_offset(cs_prog_data, cs_params.simd_size),
2426 
2427       /* WA_1606682166 */
2428       .SamplerCount           = GEN_GEN == 11 ? 0 : get_sampler_count(cs_bin),
2429       /* We add 1 because the CS indirect parameters buffer isn't accounted
2430        * for in bind_map.surface_count.
2431        */
2432       .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
2433       .BarrierEnable          = cs_prog_data->uses_barrier,
2434       .SharedLocalMemorySize  =
2435          encode_slm_size(GEN_GEN, cs_prog_data->base.total_shared),
2436 
2437 #if !GEN_IS_HASWELL
2438       .ConstantURBEntryReadOffset = 0,
2439 #endif
2440       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
2441 #if GEN_GEN >= 8 || GEN_IS_HASWELL
2442       .CrossThreadConstantDataReadLength =
2443          cs_prog_data->push.cross_thread.regs,
2444 #endif
2445 #if GEN_GEN >= 12
2446       /* TODO: Check if we are missing workarounds and enable mid-thread
2447        * preemption.
2448        *
2449        * We still have issues with mid-thread preemption (it was already
2450        * disabled by the kernel on gen11, due to missing workarounds). It's
2451        * possible that we are just missing some workarounds, and could enable
       * it later, but for now let's disable it to fix a GPU hang in compute
       * in Car Chase (and possibly more).
2454        */
2455       .ThreadPreemptionDisable = true,
2456 #endif
2457 
2458       .NumberofThreadsinGPGPUThreadGroup = cs_params.threads,
2459    };
2460    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
2461                                         pipeline->interface_descriptor_data,
2462                                         &desc);
2463 }
2464 
2465 static VkResult
2466 compute_pipeline_create(
2467     VkDevice                                    _device,
2468     struct anv_pipeline_cache *                 cache,
2469     const VkComputePipelineCreateInfo*          pCreateInfo,
2470     const VkAllocationCallbacks*                pAllocator,
2471     VkPipeline*                                 pPipeline)
2472 {
2473    ANV_FROM_HANDLE(anv_device, device, _device);
2474    struct anv_compute_pipeline *pipeline;
2475    VkResult result;
2476 
2477    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
2478 
2479    /* Use the default pipeline cache if none is specified */
2480    if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2481       cache = &device->default_pipeline_cache;
2482 
2483    pipeline = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
2484                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2485    if (pipeline == NULL)
2486       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
2487 
2488    result = anv_pipeline_init(&pipeline->base, device,
2489                               ANV_PIPELINE_COMPUTE, pCreateInfo->flags,
2490                               pAllocator);
2491    if (result != VK_SUCCESS) {
2492       vk_free2(&device->vk.alloc, pAllocator, pipeline);
2493       return result;
2494    }
2495 
2496    anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
2497                          pipeline->batch_data, sizeof(pipeline->batch_data));
2498 
2499    pipeline->cs = NULL;
2500 
2501    assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
2502    ANV_FROM_HANDLE(anv_shader_module, module,  pCreateInfo->stage.module);
2503    result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
2504                                     pCreateInfo->stage.pName,
2505                                     pCreateInfo->stage.pSpecializationInfo);
2506    if (result != VK_SUCCESS) {
2507       anv_pipeline_finish(&pipeline->base, device, pAllocator);
2508       vk_free2(&device->vk.alloc, pAllocator, pipeline);
2509       if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
2510          *pPipeline = VK_NULL_HANDLE;
2511       return result;
2512    }
2513 
2514    emit_media_cs_state(pipeline, device);
2515 
2516    *pPipeline = anv_pipeline_to_handle(&pipeline->base);
2517 
2518    return pipeline->base.batch.status;
2519 }
2520 
2521 VkResult genX(CreateGraphicsPipelines)(
2522     VkDevice                                    _device,
2523     VkPipelineCache                             pipelineCache,
2524     uint32_t                                    count,
2525     const VkGraphicsPipelineCreateInfo*         pCreateInfos,
2526     const VkAllocationCallbacks*                pAllocator,
2527     VkPipeline*                                 pPipelines)
2528 {
2529    ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
2530 
2531    VkResult result = VK_SUCCESS;
2532 
2533    unsigned i;
2534    for (i = 0; i < count; i++) {
2535       VkResult res = genX(graphics_pipeline_create)(_device,
2536                                                     pipeline_cache,
2537                                                     &pCreateInfos[i],
2538                                                     pAllocator, &pPipelines[i]);
2539 
2540       if (res == VK_SUCCESS)
2541          continue;
2542 
      /* Bail out on the first error other than VK_PIPELINE_COMPILE_REQUIRED_EXT,
       * as it is not obvious which error should be reported when two different
       * failures occur.
       */
2546       result = res;
2547       if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
2548          break;
2549 
2550       if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
2551          break;
2552    }
2553 
2554    for (; i < count; i++)
2555       pPipelines[i] = VK_NULL_HANDLE;
2556 
2557    return result;
2558 }
2559 
2560 VkResult genX(CreateComputePipelines)(
2561     VkDevice                                    _device,
2562     VkPipelineCache                             pipelineCache,
2563     uint32_t                                    count,
2564     const VkComputePipelineCreateInfo*          pCreateInfos,
2565     const VkAllocationCallbacks*                pAllocator,
2566     VkPipeline*                                 pPipelines)
2567 {
2568    ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
2569 
2570    VkResult result = VK_SUCCESS;
2571 
2572    unsigned i;
2573    for (i = 0; i < count; i++) {
2574       VkResult res = compute_pipeline_create(_device, pipeline_cache,
2575                                              &pCreateInfos[i],
2576                                              pAllocator, &pPipelines[i]);
2577 
2578       if (res == VK_SUCCESS)
2579          continue;
2580 
      /* Bail out on the first error other than VK_PIPELINE_COMPILE_REQUIRED_EXT,
       * as it is not obvious which error should be reported when two different
       * failures occur.
       */
2584       result = res;
2585       if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
2586          break;
2587 
2588       if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
2589          break;
2590    }
2591 
2592    for (; i < count; i++)
2593       pPipelines[i] = VK_NULL_HANDLE;
2594 
2595    return result;
2596 }
2597