/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "common/intel_guardband.h"
#include "common/intel_tiled_render.h"
#include "compiler/brw_prim.h"

static const uint32_t vk_to_intel_blend[] = {
   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t vk_to_intel_blend_op[] = {
   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
};

static const uint32_t vk_to_intel_cullmode[] = {
   [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
   [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
   [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
   [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
};

static const uint32_t vk_to_intel_fillmode[] = {
   [VK_POLYGON_MODE_FILL]                    = FILL_MODE_SOLID,
   [VK_POLYGON_MODE_LINE]                    = FILL_MODE_WIREFRAME,
   [VK_POLYGON_MODE_POINT]                   = FILL_MODE_POINT,
};

static const uint32_t vk_to_intel_front_face[] = {
   [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
   [VK_FRONT_FACE_CLOCKWISE]                 = 0
};

static const uint32_t vk_to_intel_logic_op[] = {
   [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
   [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
   [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
   [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
   [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
   [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
   [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
   [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
   [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
   [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
   [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
   [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
   [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
   [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
   [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
   [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
};

static const uint32_t vk_to_intel_compare_op[] = {
   [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_NEVER,
   [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LESS,
   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_ALWAYS,
};

static const uint32_t vk_to_intel_stencil_op[] = {
   [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
   [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
   [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
   [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
   [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
   [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
   [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
   [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
};

static const uint32_t vk_to_intel_primitive_type[] = {
   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};

static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
{
#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout, and re-enable it
    * if XFB is not used by the current pipeline.
    *
    * Although this workaround applies to Gfx12+, we already disable object
    * level preemption for another reason in genX_state.c so we can skip this
    * for Gfx12.
    */
   if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (pipeline->uses_xfb) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
      return;
   }

   if (!cmd_buffer->state.gfx.object_preemption)
      genX(cmd_buffer_set_preemption)(cmd_buffer, true);
#endif
}

#if GFX_VER >= 12 && GFX_VER < 30
static uint32_t
get_cps_state_offset(const struct anv_device *device, bool cps_enabled,
                     const struct vk_fragment_shading_rate_state *fsr)
{
   if (!cps_enabled)
      return device->cps_states.offset;

   uint32_t offset;
   static const uint32_t size_index[] = {
      [1] = 0,
      [2] = 1,
      [4] = 2,
   };

#if GFX_VERx10 >= 125
   offset =
      1 + /* skip disabled */
      fsr->combiner_ops[0] * 5 * 3 * 3 +
      fsr->combiner_ops[1] * 3 * 3 +
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#else
   offset =
      1 + /* skip disabled */
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#endif

   offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;

   return device->cps_states.offset + offset;
}
#endif /* GFX_VER >= 12 && GFX_VER < 30 */

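/* Worked example (illustrative numbers, not from the PRM): on Gfx12.5+ with
 * combiner_ops = { REPLACE (1), KEEP (0) } and a 2x2 fragment size,
 * size_index[2] == 1, so the computation above yields
 *
 *    offset = 1 + 1 * 45 + 0 * 9 + 1 * 3 + 1 = 50
 *
 * table entries, each entry holding one CPS_STATE per viewport; the final
 * multiply by MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4 converts that entry
 * index into a byte offset from device->cps_states.offset.
 */
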
#if GFX_VER >= 30
static uint32_t
get_cps_size(uint32_t size)
{
   switch (size) {
   case 1:
      return CPSIZE_1;
   case 2:
      return CPSIZE_2;
   case 4:
      return CPSIZE_4;
   default:
      unreachable("Invalid size");
   }
}

static const uint32_t vk_to_intel_shading_rate_combiner_op[] = {
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = CPS_COMB_OP_PASSTHROUGH,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = CPS_COMB_OP_OVERRIDE,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = CPS_COMB_OP_HIGH_QUALITY,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = CPS_COMB_OP_LOW_QUALITY,
   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = CPS_COMB_OP_RELATIVE,
};
#endif

static bool
has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn)
{
   return (dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                  VK_IMAGE_ASPECT_STENCIL_BIT)) ||
      dyn->ial.depth_att != MESA_VK_ATTACHMENT_UNUSED ||
      dyn->ial.stencil_att != MESA_VK_ATTACHMENT_UNUSED;
}

UNUSED static bool
want_stencil_pma_fix(const struct vk_dynamic_graphics_state *dyn,
                     const struct anv_cmd_graphics_state *gfx,
                     const struct vk_depth_stencil_state *ds)
{
   if (GFX_VER > 9)
      return false;
   assert(GFX_VER == 9);

   /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
    *
    *    Clearing this bit will force the STC cache to wait for pending
    *    retirement of pixels at the HZ-read stage and do the STC-test for
    *    Non-promoted, R-computed and Computed depth modes instead of
    *    postponing the STC-test to RCPFE.
    *
    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                  3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    *
    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *                    3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    *
    *    COMP_STC_EN = STC_TEST_EN &&
    *                  3DSTATE_PS_EXTRA::PixelShaderComputesStencil
    *
    *    SW parses the pipeline states to generate the following logical
    *    signal indicating if PMA FIX can be enabled.
    *
    *    STC_PMA_OPT =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *       (COMP_STC_EN || STC_WRITE_EN) &&
    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *         3DSTATE_WM::ForceKillPix == ON ||
    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
    */

   /* These are always true:
    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
    */

   /* We only enable the PMA fix if we know for certain that HiZ is enabled.
    * If we don't know whether HiZ is enabled or not, we disable the PMA fix
    * and there is no harm.
    *
    * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
    */
   if (!gfx->hiz_enabled)
      return false;

   /* We can't possibly know if HiZ is enabled without the depth attachment */
   ASSERTED const struct anv_image_view *d_iview = gfx->depth_att.iview;
   assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);

   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx->base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
      return false;

   /* !(3DSTATE_WM::EDSC_Mode == 2) */
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* We never use anv_pipeline for HiZ ops so this is trivially true:
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    */

   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    */
   const bool stc_test_en = ds->stencil.test_enable;

   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *  3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    */
   const bool stc_write_en = ds->stencil.write_enable;

   /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
   const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;

   /* COMP_STC_EN || STC_WRITE_EN */
   if (!(comp_stc_en || stc_write_en))
      return false;

   /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_WM::ForceKillPix == ON ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
    */
   return pipeline->kill_pixel ||
          pipeline->rp_has_ds_self_dep ||
          has_ds_feedback_loop(dyn) ||
          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}

static inline bool
anv_rasterization_aa_mode(VkPolygonMode raster_mode,
                          VkLineRasterizationModeKHR line_mode)
{
   if (raster_mode == VK_POLYGON_MODE_LINE &&
       line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
      return true;
   return false;
}

static inline VkLineRasterizationModeKHR
anv_line_rasterization_mode(VkLineRasterizationModeKHR line_mode,
                            unsigned rasterization_samples)
{
   if (line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR) {
      if (rasterization_samples > 1) {
         return VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR;
      } else {
         return VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
      }
   }
   return line_mode;
}

/** Returns the final polygon mode for rasterization
 *
 * This function takes into account polygon mode, primitive topology and the
 * different shader stages which might generate their own type of primitives.
 */
static inline VkPolygonMode
anv_raster_polygon_mode(const struct anv_graphics_pipeline *pipeline,
                        VkPolygonMode polygon_mode,
                        VkPrimitiveTopology primitive_topology)
{
   if (anv_pipeline_is_mesh(pipeline)) {
      switch (get_mesh_prog_data(pipeline)->primitive_type) {
      case MESA_PRIM_POINTS:
         return VK_POLYGON_MODE_POINT;
      case MESA_PRIM_LINES:
         return VK_POLYGON_MODE_LINE;
      case MESA_PRIM_TRIANGLES:
         return polygon_mode;
      default:
         unreachable("invalid primitive type for mesh");
      }
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      switch (get_gs_prog_data(pipeline)->output_topology) {
      case _3DPRIM_POINTLIST:
         return VK_POLYGON_MODE_POINT;

      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         return VK_POLYGON_MODE_LINE;

      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         return polygon_mode;
      }
      unreachable("Unsupported GS output topology");
   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      switch (get_tes_prog_data(pipeline)->output_topology) {
      case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

      case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
         return VK_POLYGON_MODE_LINE;

      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return polygon_mode;
      }
      unreachable("Unsupported TES output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
         return VK_POLYGON_MODE_POINT;

      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
         return VK_POLYGON_MODE_LINE;

      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
         return polygon_mode;

      default:
         unreachable("Unsupported primitive topology");
      }
   }
}

static inline bool
anv_is_dual_src_blend_factor(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
}

static inline bool
anv_is_dual_src_blend_equation(const struct vk_color_blend_attachment_state *cb)
{
   return anv_is_dual_src_blend_factor(cb->src_color_blend_factor) &&
          anv_is_dual_src_blend_factor(cb->dst_color_blend_factor) &&
          anv_is_dual_src_blend_factor(cb->src_alpha_blend_factor) &&
          anv_is_dual_src_blend_factor(cb->dst_alpha_blend_factor);
}

static void
anv_rasterization_mode(VkPolygonMode raster_mode,
                       VkLineRasterizationModeKHR line_mode,
                       float line_width,
                       uint32_t *api_mode,
                       bool *msaa_rasterization_enable)
{
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX101;
#if GFX_VER <= 9
         /* Prior to ICL, the algorithm the HW uses to draw wide lines
          * doesn't quite match what the CTS expects, at least for rectangular
          * lines, so we set this to false here, making it draw parallelograms
          * instead, which work well enough.
          */
         *msaa_rasterization_enable = line_width < 1.0078125;
#else
         *msaa_rasterization_enable = true;
#endif
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX101;
      *msaa_rasterization_enable = true;
   }
}

static bool
is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
{
   return factor == BLENDFACTOR_SRC1_COLOR ||
          factor == BLENDFACTOR_SRC1_ALPHA ||
          factor == BLENDFACTOR_INV_SRC1_COLOR ||
          factor == BLENDFACTOR_INV_SRC1_ALPHA;
}

#if GFX_VERx10 == 125
/**
 * Return the dimensions of the current rendering area, defined as the
 * bounding box of all present color, depth and stencil attachments.
 */
UNUSED static bool
calculate_render_area(const struct anv_cmd_graphics_state *gfx,
                      unsigned *width, unsigned *height)
{
   *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
   *height = gfx->render_area.offset.y + gfx->render_area.extent.height;

   for (unsigned i = 0; i < gfx->color_att_count; i++) {
      const struct anv_attachment *att = &gfx->color_att[i];
      if (att->iview) {
         *width = MAX2(*width, att->iview->vk.extent.width);
         *height = MAX2(*height, att->iview->vk.extent.height);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      *width = MAX2(*width, z_view->vk.extent.width);
      *height = MAX2(*height, z_view->vk.extent.height);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view) {
      *width = MAX2(*width, s_view->vk.extent.width);
      *height = MAX2(*height, s_view->vk.extent.height);
   }

   return *width && *height;
}

/* Calculate TBIMR tiling parameters adequate for the current pipeline
 * setup.  Return true if TBIMR should be enabled.
 */
UNUSED static bool
calculate_tile_dimensions(const struct anv_device *device,
                          const struct anv_cmd_graphics_state *gfx,
                          const struct intel_l3_config *l3_config,
                          unsigned fb_width, unsigned fb_height,
                          unsigned *tile_width, unsigned *tile_height)
{
   assert(GFX_VER == 12);
   const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;

   unsigned pixel_size = 0;

   /* Perform a rough calculation of the tile cache footprint of the
    * pixel pipeline, approximating it as the sum of the amount of
    * memory used per pixel by every render target, depth, stencil and
    * auxiliary surfaces bound to the pipeline.
    */
   for (uint32_t i = 0; i < gfx->color_att_count; i++) {
      const struct anv_attachment *att = &gfx->color_att[i];

      if (att->iview) {
         const struct anv_image *image = att->iview->image;
         const unsigned p = anv_image_aspect_to_plane(image,
                                                      VK_IMAGE_ASPECT_COLOR_BIT);
         const struct anv_image_plane *plane = &image->planes[p];

         pixel_size += intel_calculate_surface_pixel_size(
            &plane->primary_surface.isl);

         if (isl_aux_usage_has_mcs(att->aux_usage))
            pixel_size += intel_calculate_surface_pixel_size(
               &plane->aux_surface.isl);

         if (isl_aux_usage_has_ccs(att->aux_usage))
            pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                          &plane->primary_surface.isl),
                                       aux_scale);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      const struct anv_image *image = z_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);

      if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
         pixel_size += intel_calculate_surface_pixel_size(
            &plane->aux_surface.isl);

      if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
         pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                       &plane->primary_surface.isl),
                                    aux_scale);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view && s_view != z_view) {
      const struct anv_image *image = s_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);
   }

   if (!pixel_size)
      return false;

   /* Compute a tile layout that allows reasonable utilization of the
    * tile cache based on the per-pixel cache footprint estimated
    * above.
    */
   intel_calculate_tile_dimensions(device->info, l3_config,
                                   32, 32, fb_width, fb_height,
                                   pixel_size, tile_width, tile_height);

   /* Perform TBIMR tile passes only if the framebuffer covers more
    * than a single tile.
    */
   return *tile_width < fb_width || *tile_height < fb_height;
}
#endif

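/* Rough footprint example (illustrative): a single RGBA8 color attachment
 * with CCS would contribute 4 + DIV_ROUND_UP(4, aux_scale) = 5 bytes per
 * pixel to the estimate above (aux_scale being the 256:1 main-to-CCS size
 * ratio on Xe), which intel_calculate_tile_dimensions() then balances
 * against the available tile cache to pick tile_width x tile_height.
 */
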
#define GET(field) hw_state->field
#define SET(bit, field, value)                               \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit);   \
      }                                                      \
   } while (0)
#define SET_STAGE(bit, field, value, stage)                  \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (!anv_pipeline_has_stage(pipeline,                  \
                                  MESA_SHADER_##stage)) {    \
         hw_state->field = __v;                              \
         break;                                              \
      }                                                      \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit);   \
      }                                                      \
   } while (0)
#define SETUP_PROVOKING_VERTEX(bit, cmd, mode)                         \
   switch (mode) {                                                     \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                     \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);         \
      break;                                                           \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                      \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     1);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       2);         \
      break;                                                           \
   default:                                                            \
      unreachable("Invalid provoking vertex mode");                    \
   }

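/* Usage sketch: SET(RASTER, raster.CullMode, CULLMODE_BACK) compares the new
 * value with the one cached in hw_state->raster.CullMode and, only if it
 * differs, stores it and sets the ANV_GFX_STATE_RASTER dirty bit so that the
 * matching 3DSTATE_RASTER packet gets re-emitted.  SET_STAGE() additionally
 * skips the dirty tracking when the pipeline lacks the given shader stage.
 */
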
ALWAYS_INLINE static void
update_fs_msaa_flags(struct anv_gfx_dynamic_state *hw_state,
                     const struct vk_dynamic_graphics_state *dyn,
                     const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!wm_prog_data)
      return;

   /* If we have any dynamic bits here, we might need to update the value
    * in the push constant for the shader.
    */
   if (wm_prog_data->coarse_pixel_dispatch != INTEL_SOMETIMES &&
       wm_prog_data->persample_dispatch != INTEL_SOMETIMES &&
       wm_prog_data->alpha_to_coverage != INTEL_SOMETIMES)
      return;

   enum intel_msaa_flags fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC;

   if (dyn->ms.rasterization_samples > 1) {
      fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO;

      if (wm_prog_data->sample_shading) {
         assert(wm_prog_data->persample_dispatch != INTEL_NEVER);
         fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH;
      }
      if ((pipeline->sample_shading_enable &&
           (pipeline->min_sample_shading * dyn->ms.rasterization_samples) > 1) ||
          wm_prog_data->sample_shading) {
         fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH |
                          INTEL_MSAA_FLAG_PERSAMPLE_INTERP;
      }
   }

   if (wm_prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES &&
       !(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)) {
      fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG |
                       INTEL_MSAA_FLAG_COARSE_RT_WRITES;
   }

   if (dyn->ms.alpha_to_coverage_enable)
      fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;

   SET(FS_MSAA_FLAGS, fs_msaa_flags, fs_msaa_flags);
}

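/* Example (illustrative): with dyn->ms.rasterization_samples = 4 and a
 * pipeline min_sample_shading of 0.5, 0.5 * 4 = 2 > 1, so the flags above
 * gain INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH | INTEL_MSAA_FLAG_PERSAMPLE_INTERP
 * and a SOMETIMES-compiled fragment shader runs per-sample.
 */
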
ALWAYS_INLINE static void
update_ps(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_device *device,
          const struct vk_dynamic_graphics_state *dyn,
          const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!wm_prog_data) {
#if GFX_VER < 20
      SET(PS, ps._8PixelDispatchEnable,  false);
      SET(PS, ps._16PixelDispatchEnable, false);
      SET(PS, ps._32PixelDispatchEnable, false);
#else
      SET(PS, ps.Kernel0Enable, false);
      SET(PS, ps.Kernel1Enable, false);
#endif
      return;
   }

   const struct anv_shader_bin *fs_bin =
      pipeline->base.shaders[MESA_SHADER_FRAGMENT];
   struct GENX(3DSTATE_PS) ps = {};
   intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
                               MAX2(dyn->ms.rasterization_samples, 1),
                               hw_state->fs_msaa_flags);

   SET(PS, ps.KernelStartPointer0,
           fs_bin->kernel.offset +
           brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
   SET(PS, ps.KernelStartPointer1,
           fs_bin->kernel.offset +
           brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
#if GFX_VER < 20
   SET(PS, ps.KernelStartPointer2,
           fs_bin->kernel.offset +
           brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
#endif

   SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));
   SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));
#if GFX_VER < 20
   SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));
#endif

#if GFX_VER < 20
   SET(PS, ps._8PixelDispatchEnable,  ps._8PixelDispatchEnable);
   SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
   SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
#else
   SET(PS, ps.Kernel0Enable,            ps.Kernel0Enable);
   SET(PS, ps.Kernel1Enable,            ps.Kernel1Enable);
   SET(PS, ps.Kernel0SIMDWidth,         ps.Kernel0SIMDWidth);
   SET(PS, ps.Kernel1SIMDWidth,         ps.Kernel1SIMDWidth);
   SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
#endif

   SET(PS, ps.PositionXYOffsetSelect,
           !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
           brw_wm_prog_data_is_persample(wm_prog_data,
                                         hw_state->fs_msaa_flags) ?
           POSOFFSET_SAMPLE : POSOFFSET_CENTROID);
}

ALWAYS_INLINE static void
update_ps_extra_wm(struct anv_gfx_dynamic_state *hw_state,
                   const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!wm_prog_data)
      return;

   SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
                 brw_wm_prog_data_is_persample(wm_prog_data,
                                               hw_state->fs_msaa_flags));
#if GFX_VER >= 11
   const bool uses_coarse_pixel =
      brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags);
   SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel, uses_coarse_pixel);
#endif
#if GFX_VERx10 >= 125
   /* TODO: We should only require this when the last geometry shader uses a
    *       fragment shading rate that is not constant.
    */
   SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange, uses_coarse_pixel);
#endif

   SET(WM, wm.BarycentricInterpolationMode,
           wm_prog_data_barycentric_modes(wm_prog_data, hw_state->fs_msaa_flags));
}

ALWAYS_INLINE static void
update_ps_extra_has_uav(struct anv_gfx_dynamic_state *hw_state,
                        const struct anv_cmd_graphics_state *gfx,
                        const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

#if GFX_VERx10 >= 125
   SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
                       wm_prog_data && wm_prog_data->has_side_effects,
                       FRAGMENT);
#else
   /* Prior to Gfx12.5 the HW seems to avoid spawning fragment shaders even if
    * 3DSTATE_PS_EXTRA::PixelShaderKillsPixel=true when
    * 3DSTATE_PS_BLEND::HasWriteableRT=false. This is causing problems with
    * occlusion queries with 0 attachments. There are no CTS tests exercising
    * this but zink+anv fails a bunch of tests like piglit
    * arb_framebuffer_no_attachments-query.
    *
    * Here we choose to tweak the PixelShaderHasUAV to make sure the fragment
    * shaders are run properly.
    */
   SET_STAGE(PS_EXTRA, ps_extra.PixelShaderHasUAV,
                       wm_prog_data && (wm_prog_data->has_side_effects ||
                                        (gfx->color_att_count == 0 &&
                                         gfx->n_occlusion_queries > 0)),
                       FRAGMENT);
#endif
}

ALWAYS_INLINE static void
update_ps_extra_kills_pixel(struct anv_gfx_dynamic_state *hw_state,
                            const struct vk_dynamic_graphics_state *dyn,
                            const struct anv_cmd_graphics_state *gfx,
                            const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
                       wm_prog_data && (pipeline->rp_has_ds_self_dep ||
                                        has_ds_feedback_loop(dyn) ||
                                        wm_prog_data->uses_kill),
                       FRAGMENT);
}

#if GFX_VERx10 >= 125
ALWAYS_INLINE static void
update_vfg_list_cut_index(struct anv_gfx_dynamic_state *hw_state,
                          const struct vk_dynamic_graphics_state *dyn)
{
   SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable);
}
#endif

ALWAYS_INLINE static void
update_streamout(struct anv_gfx_dynamic_state *hw_state,
                 const struct vk_dynamic_graphics_state *dyn,
                 const struct anv_cmd_graphics_state *gfx,
                 const struct anv_graphics_pipeline *pipeline)
{
   SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
   SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);

#if INTEL_NEEDS_WA_18022508906
   /* Wa_18022508906:
    *
    * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
    *
    * SOL_INT::Render_Enable =
    *   (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
    *   (
    *     (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
    *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
    *     !3DSTATE_STREAMOUT::API_Render_Disable &&
    *     (
    *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
    *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
    *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
    *       3DSTATE_PS_EXTRA::PS_Valid ||
    *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
    *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
    *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
    *     )
    *   )
    *
    * If SOL_INT::Render_Enable is false, the SO stage will not forward any
    * topologies down the pipeline, which is not what we want for occlusion
    * queries.
    *
    * Here we force rendering to get SOL_INT::Render_Enable when occlusion
    * queries are active.
    */
   SET(STREAMOUT, so.ForceRendering,
       (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ?
       Force_on : 0);
#endif
}

ALWAYS_INLINE static void
update_provoking_vertex(struct anv_gfx_dynamic_state *hw_state,
                        const struct vk_dynamic_graphics_state *dyn,
                        const struct anv_graphics_pipeline *pipeline)
{
   SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
   SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);

   switch (dyn->rs.provoking_vertex) {
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
      SET(STREAMOUT, so.ReorderMode, LEADING);
      SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY);
      break;

   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
      SET(STREAMOUT, so.ReorderMode, TRAILING);
      SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY);
      break;

   default:
      unreachable("Invalid provoking vertex mode");
   }
}

ALWAYS_INLINE static void
update_topology(struct anv_gfx_dynamic_state *hw_state,
                const struct vk_dynamic_graphics_state *dyn,
                const struct anv_graphics_pipeline *pipeline)
{
   uint32_t topology =
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
      _3DPRIM_PATCHLIST(dyn->ts.patch_control_points) :
      vk_to_intel_primitive_type[dyn->ia.primitive_topology];

   SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
}

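/* Example: with tessellation enabled and dyn->ts.patch_control_points = 3,
 * the hardware topology is _3DPRIM_PATCHLIST(3), the 3-control-point patch
 * list; without tessellation the Vulkan topology goes through
 * vk_to_intel_primitive_type[] instead.
 */
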
#if GFX_VER >= 11
ALWAYS_INLINE static void
update_cps(struct anv_gfx_dynamic_state *hw_state,
           const struct anv_device *device,
           const struct vk_dynamic_graphics_state *dyn,
           const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   if (!wm_prog_data)
      return;

   UNUSED const bool cps_enable =
      brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags);

#if GFX_VER >= 30
   SET(COARSE_PIXEL, coarse_pixel.CPSizeX,
       get_cps_size(dyn->fsr.fragment_size.width));
   SET(COARSE_PIXEL, coarse_pixel.CPSizeY,
       get_cps_size(dyn->fsr.fragment_size.height));
   SET(COARSE_PIXEL, coarse_pixel.CPSizeCombiner0Opcode,
       vk_to_intel_shading_rate_combiner_op[dyn->fsr.combiner_ops[0]]);
   SET(COARSE_PIXEL, coarse_pixel.CPSizeCombiner1Opcode,
       vk_to_intel_shading_rate_combiner_op[dyn->fsr.combiner_ops[1]]);
#elif GFX_VER >= 12
   SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
       get_cps_state_offset(device, cps_enable, &dyn->fsr));
#else
   STATIC_ASSERT(GFX_VER == 11);
   SET(CPS, cps.CoarsePixelShadingMode,
            cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE);
   SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
   SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
#endif
}
#endif

ALWAYS_INLINE static void
update_te(struct anv_gfx_dynamic_state *hw_state,
          const struct vk_dynamic_graphics_state *dyn,
          const struct anv_graphics_pipeline *pipeline)
{
   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

   if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
         SET(TE, te.OutputTopology, tes_prog_data->output_topology);
      } else {
         /* When the origin is upper-left, we have to flip the winding order */
         if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
            SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
         } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
            SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
         } else {
            SET(TE, te.OutputTopology, tes_prog_data->output_topology);
         }
      }
   } else {
      SET(TE, te.OutputTopology, OUTPUT_POINT);
   }
}

ALWAYS_INLINE static void
update_line_width(struct anv_gfx_dynamic_state *hw_state,
                  const struct vk_dynamic_graphics_state *dyn)
{
   SET(SF, sf.LineWidth, dyn->rs.line.width);
}

ALWAYS_INLINE static void
update_sf_global_depth_bias(struct anv_gfx_dynamic_state *hw_state,
                            const struct vk_dynamic_graphics_state *dyn)
{
   /**
    * From the Vulkan Spec:
    *
    *    "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth bias
    *     representation is a factor of constant r equal to 1."
    *
    * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
    *
    *    "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
    *
    *     Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
    *
    *     Where r is the minimum representable value > 0 in the depth buffer
    *     format, converted to float32 (note: If state bit Legacy Global Depth
    *     Bias Enable is set, the r term will be forced to 1.0)"
    *
    * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
    * LegacyGlobalDepthBiasEnable.
    */
   SET(SF, sf.LegacyGlobalDepthBiasEnable,
           dyn->rs.depth_bias.representation ==
           VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
}

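/* Example (illustrative): with a D16_UNORM depth buffer, r = 1 / (2^16 - 1),
 * so a constant_factor of 4 biases depth by roughly 6.1e-5; under the float
 * representation the legacy bit forces r = 1.0 and the same factor biases by
 * 4.0.
 */
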
ALWAYS_INLINE static void
update_clip_api_mode(struct anv_gfx_dynamic_state *hw_state,
                     const struct vk_dynamic_graphics_state *dyn)
{
   SET(CLIP, clip.APIMode,
             dyn->vp.depth_clip_negative_one_to_one ?
             APIMODE_OGL : APIMODE_D3D);
}

ALWAYS_INLINE static void
update_clip_max_viewport(struct anv_gfx_dynamic_state *hw_state,
                         const struct vk_dynamic_graphics_state *dyn)
{
   /* From the Vulkan 1.0.45 spec:
    *
    *    "If the last active vertex processing stage shader entry point's
    *     interface does not include a variable decorated with ViewportIndex,
    *     then the first viewport is used."
    *
    * This could mean that we might need to set the MaximumVPIndex based on
    * the pipeline's last stage, but if the last shader doesn't write the
    * viewport index and the VUE header is used, the compiler will force the
    * value to 0 (which is what the spec requires above). Otherwise it seems
    * like the HW should be pulling 0 if the VUE header is not present.
    *
    * Avoiding a check on the pipeline seems to prevent additional emissions
    * of 3DSTATE_CLIP which appear to impact performance on Assassin's Creed
    * Valhalla.
    */
   SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ?
                                  dyn->vp.viewport_count - 1 : 0);
}

ALWAYS_INLINE static void
update_clip_raster(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn,
                   const struct anv_cmd_graphics_state *gfx,
                   const struct anv_graphics_pipeline *pipeline)
{
   /* Take dynamic primitive topology into account with
    *    3DSTATE_RASTER::APIMode
    *    3DSTATE_RASTER::DXMultisampleRasterizationEnable
    *    3DSTATE_RASTER::AntialiasingEnable
    */
   uint32_t api_mode = 0;
   bool msaa_raster_enable = false;

   const VkLineRasterizationModeKHR line_mode =
      anv_line_rasterization_mode(dyn->rs.line.mode,
                                  dyn->ms.rasterization_samples);

   const VkPolygonMode dynamic_raster_mode =
      anv_raster_polygon_mode(pipeline,
                              dyn->rs.polygon_mode,
                              dyn->ia.primitive_topology);

   anv_rasterization_mode(dynamic_raster_mode,
                          line_mode, dyn->rs.line.width,
                          &api_mode, &msaa_raster_enable);

   /* From the Broadwell PRM, Volume 2, documentation for 3DSTATE_RASTER,
    * "Antialiasing Enable":
    *
    * "This field must be disabled if any of the render targets have integer
    * (UINT or SINT) surface format."
    *
    * Additionally internal documentation for Gfx12+ states:
    *
    * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
    *  FORCED_SAMPLE_COUNT > 1."
    */
   const bool aa_enable =
      anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
      !gfx->has_uint_rt &&
      !(GFX_VER >= 12 && gfx->samples > 1);

   const bool depth_clip_enable =
      vk_rasterization_state_depth_clip_enable(&dyn->rs);

   const bool xy_clip_test_enable =
      (dynamic_raster_mode == VK_POLYGON_MODE_FILL);

   SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);

   SET(RASTER, raster.APIMode, api_mode);
   SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
   SET(RASTER, raster.AntialiasingEnable, aa_enable);
   SET(RASTER, raster.CullMode, vk_to_intel_cullmode[dyn->rs.cull_mode]);
   SET(RASTER, raster.FrontWinding, vk_to_intel_front_face[dyn->rs.front_face]);
   SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
   SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant_factor);
   SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope_factor);
   SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
   SET(RASTER, raster.FrontFaceFillMode, vk_to_intel_fillmode[dyn->rs.polygon_mode]);
   SET(RASTER, raster.BackFaceFillMode, vk_to_intel_fillmode[dyn->rs.polygon_mode]);
   SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
   SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
   SET(RASTER, raster.ConservativeRasterizationEnable,
               dyn->rs.conservative_mode !=
               VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
}

ALWAYS_INLINE static void
update_multisample(struct anv_gfx_dynamic_state *hw_state,
                   const struct vk_dynamic_graphics_state *dyn)
{
   SET(MULTISAMPLE, ms.NumberofMultisamples,
                    __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1);
}

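/* __builtin_ffs() turns the sample count into the log2 encoding the field
 * expects: 1, 2, 4, 8 and 16 samples map to 0, 1, 2, 3 and 4 respectively.
 */
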
1185 ALWAYS_INLINE static void
update_sample_mask(struct anv_gfx_dynamic_state * hw_state,const struct vk_dynamic_graphics_state * dyn)1186 update_sample_mask(struct anv_gfx_dynamic_state *hw_state,
1187                    const struct vk_dynamic_graphics_state *dyn)
1188 {
1189    /* From the Vulkan 1.0 spec:
1190     *    If pSampleMask is NULL, it is treated as if the mask has all bits
1191     *    enabled, i.e. no coverage is removed from fragments.
1192     *
1193     * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
1194     */
1195    SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff);
1196 }
1197 
1198 ALWAYS_INLINE static void
update_wm_depth_stencil(struct anv_gfx_dynamic_state * hw_state,const struct vk_dynamic_graphics_state * dyn,const struct anv_cmd_graphics_state * gfx,const struct anv_device * device)1199 update_wm_depth_stencil(struct anv_gfx_dynamic_state *hw_state,
1200                         const struct vk_dynamic_graphics_state *dyn,
1201                         const struct anv_cmd_graphics_state *gfx,
1202                         const struct anv_device *device)
1203 {
1204    VkImageAspectFlags ds_aspects = 0;
1205    if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
1206       ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
1207    if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
1208       ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
1209 
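   /* Take a local copy of the dynamic depth/stencil state and let the
    * runtime optimize away state that cannot take effect given the bound
    * attachment aspects.
    */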
1210    struct vk_depth_stencil_state opt_ds = dyn->ds;
1211    vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);
1212 
1213    SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);
1214 
1215    SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
1216        opt_ds.stencil.front.compare_mask & 0xff);
1217    SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
1218        opt_ds.stencil.front.write_mask & 0xff);
1219 
1220    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
1221    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);
1222 
1223    SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
1224        opt_ds.stencil.front.reference & 0xff);
1225    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
1226        opt_ds.stencil.back.reference & 0xff);
1227 
1228    SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
1229    SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
1230    SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
1231                          vk_to_intel_compare_op[opt_ds.depth.compare_op]);
1232    SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
1233    SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable,
1234                          opt_ds.stencil.write_enable);
1235    SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
1236                          vk_to_intel_stencil_op[opt_ds.stencil.front.op.fail]);
1237    SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
1238                          vk_to_intel_stencil_op[opt_ds.stencil.front.op.pass]);
1239    SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
1240                          vk_to_intel_stencil_op[
1241                             opt_ds.stencil.front.op.depth_fail]);
1242    SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
1243                          vk_to_intel_compare_op[
1244                             opt_ds.stencil.front.op.compare]);
1245    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
1246                          vk_to_intel_stencil_op[
1247                             opt_ds.stencil.back.op.fail]);
1248    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
1249                          vk_to_intel_stencil_op[
1250                             opt_ds.stencil.back.op.pass]);
1251    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
1252                          vk_to_intel_stencil_op[
1253                             opt_ds.stencil.back.op.depth_fail]);
1254    SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
1255                          vk_to_intel_compare_op[
1256                             opt_ds.stencil.back.op.compare]);
1257 
1258 #if GFX_VER == 9
1259    const bool pma = want_stencil_pma_fix(dyn, gfx, &opt_ds);
1260    SET(PMA_FIX, pma_fix, pma);
1261 #endif
1262 
1263 #if INTEL_WA_18019816803_GFX_VER
1264    if (intel_needs_workaround(device->info, 18019816803)) {
1265       bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
1266       SET(WA_18019816803, ds_write_state, ds_write_state);
1267    }
1268 #endif
1269 }
1270 
1271 ALWAYS_INLINE static void
1272 update_depth_bounds(struct anv_gfx_dynamic_state *hw_state,
1273                     const struct vk_dynamic_graphics_state *dyn)
1274 {
1275    SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);
1276    /* Only look at updating the bounds if testing is enabled */
1277    if (dyn->ds.depth.bounds_test.enable) {
1278       SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
1279       SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
1280    }
1281 }
1282 
1283 ALWAYS_INLINE static void
1284 update_line_stipple(struct anv_gfx_dynamic_state *hw_state,
1285                     const struct vk_dynamic_graphics_state *dyn)
1286 {
1287    SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
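   /* The HW wants both the repeat count and its reciprocal; the factor is
    * clamped to at least 1 in the reciprocal to avoid a division by zero.
    */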
1288    SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
1289                      1.0f / MAX2(1, dyn->rs.line.stipple.factor));
1290    SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);
1291 
1292    SET(WM,           wm.LineStippleEnable, dyn->rs.line.stipple.enable);
1293 }
1294 
1295 ALWAYS_INLINE static void
1296 update_vf_restart(struct anv_gfx_dynamic_state *hw_state,
1297                   const struct vk_dynamic_graphics_state *dyn,
1298                   const struct anv_cmd_graphics_state *gfx)
1299 {
1300    SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
1301    SET(VF, vf.CutIndex, gfx->restart_index);
1302 }
1303 
1304 ALWAYS_INLINE static void
1305 update_blend_state(struct anv_gfx_dynamic_state *hw_state,
1306                    const struct vk_dynamic_graphics_state *dyn,
1307                    struct anv_cmd_graphics_state *gfx,
1308                    const struct anv_device *device,
1309                    bool has_fs_stage,
1310                    bool has_fs_dual_src)
1311 {
1312    const struct anv_instance *instance = device->physical->instance;
1313    const uint8_t color_writes = dyn->cb.color_write_enables;
1314    bool has_writeable_rt =
1315       has_fs_stage &&
1316       !anv_gfx_all_color_write_masked(gfx, dyn);
1317 
1318    SET(BLEND_STATE, blend.AlphaToCoverageEnable,
1319                     dyn->ms.alpha_to_coverage_enable);
1320    SET(BLEND_STATE, blend.AlphaToOneEnable,
1321                     dyn->ms.alpha_to_one_enable);
1322    SET(BLEND_STATE, blend.ColorDitherEnable,
1323                     gfx->rendering_flags &
1324                     VK_RENDERING_ENABLE_LEGACY_DITHERING_BIT_EXT);
1325 
1326    bool independent_alpha_blend = false;
1327    /* Wa_14018912822, check if we set these during RT setup. */
1328    bool color_blend_zero = false;
1329    bool alpha_blend_zero = false;
1330    uint32_t rt_0 = MESA_VK_ATTACHMENT_UNUSED;
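   /* Walk the HW render target slots; color_output_mapping[] remaps each HW
    * RT slot to a Vulkan color attachment index, and unmapped slots are
    * skipped.
    */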
1331    for (uint32_t rt = 0; rt < MAX_RTS; rt++) {
1332       if (gfx->color_output_mapping[rt] >= gfx->color_att_count)
1333          continue;
1334 
1335       uint32_t att = gfx->color_output_mapping[rt];
1336       if (att == 0)
1337          rt_0 = rt;
1338 
1339       /* Disable anything above the current number of color attachments. */
1340       bool write_disabled = (color_writes & BITFIELD_BIT(att)) == 0;
1341 
1342       SET(BLEND_STATE, blend.rts[rt].WriteDisableAlpha,
1343                        write_disabled ||
1344                        (dyn->cb.attachments[att].write_mask &
1345                         VK_COLOR_COMPONENT_A_BIT) == 0);
1346       SET(BLEND_STATE, blend.rts[rt].WriteDisableRed,
1347                        write_disabled ||
1348                        (dyn->cb.attachments[att].write_mask &
1349                         VK_COLOR_COMPONENT_R_BIT) == 0);
1350       SET(BLEND_STATE, blend.rts[rt].WriteDisableGreen,
1351                        write_disabled ||
1352                        (dyn->cb.attachments[att].write_mask &
1353                         VK_COLOR_COMPONENT_G_BIT) == 0);
1354       SET(BLEND_STATE, blend.rts[rt].WriteDisableBlue,
1355                        write_disabled ||
1356                        (dyn->cb.attachments[att].write_mask &
1357                         VK_COLOR_COMPONENT_B_BIT) == 0);
1358       /* Vulkan specification 1.2.168, VkLogicOp:
1359        *
1360        *   "Logical operations are controlled by the logicOpEnable and logicOp
1361        *   members of VkPipelineColorBlendStateCreateInfo. If logicOpEnable is
1362        *   VK_TRUE, then a logical operation selected by logicOp is applied
1363        *   between each color attachment and the fragment’s corresponding
1364        *   output value, and blending of all attachments is treated as if it
1365        *   were disabled."
1366        *
1367        * From the Broadwell PRM Volume 2d: Command Reference: Structures:
1368        * BLEND_STATE_ENTRY:
1369        *
1370        *   "Enabling LogicOp and Color Buffer Blending at the same time is
1371        *   UNDEFINED"
1372        */
1373       SET(BLEND_STATE, blend.rts[rt].LogicOpFunction,
1374                        vk_to_intel_logic_op[dyn->cb.logic_op]);
1375       SET(BLEND_STATE, blend.rts[rt].LogicOpEnable, dyn->cb.logic_op_enable);
1376 
1377       SET(BLEND_STATE, blend.rts[rt].ColorClampRange, COLORCLAMP_RTFORMAT);
1378       SET(BLEND_STATE, blend.rts[rt].PreBlendColorClampEnable, true);
1379       SET(BLEND_STATE, blend.rts[rt].PostBlendColorClampEnable, true);
1380 
1381       /* Setup blend equation. */
1382       SET(BLEND_STATE, blend.rts[rt].ColorBlendFunction,
1383                        vk_to_intel_blend_op[
1384                           dyn->cb.attachments[att].color_blend_op]);
1385       SET(BLEND_STATE, blend.rts[rt].AlphaBlendFunction,
1386                        vk_to_intel_blend_op[
1387                           dyn->cb.attachments[att].alpha_blend_op]);
1388 
1389       if (dyn->cb.attachments[att].src_color_blend_factor !=
1390           dyn->cb.attachments[att].src_alpha_blend_factor ||
1391           dyn->cb.attachments[att].dst_color_blend_factor !=
1392           dyn->cb.attachments[att].dst_alpha_blend_factor ||
1393           dyn->cb.attachments[att].color_blend_op !=
1394           dyn->cb.attachments[att].alpha_blend_op)
1395          independent_alpha_blend = true;
1396 
1397       /* The Dual Source Blending documentation says:
1398        *
1399        * "If SRC1 is included in a src/dst blend factor and a DualSource RT
1400        * Write message is not used, results are UNDEFINED. (This reflects the
1401        * same restriction in DX APIs, where undefined results are produced if
1402        * “o1” is not written by a PS – there are no default values defined)."
1403        *
1404        * There is no way to gracefully fix this undefined situation so we just
1405        * disable the blending to prevent possible issues.
1406        */
1407       if (has_fs_stage && !has_fs_dual_src &&
1408           anv_is_dual_src_blend_equation(&dyn->cb.attachments[att])) {
1409          SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable, false);
1410       } else {
1411          SET(BLEND_STATE, blend.rts[rt].ColorBufferBlendEnable,
1412                           !dyn->cb.logic_op_enable &&
1413                           dyn->cb.attachments[att].blend_enable);
1414       }
1415 
1416       /* Our hardware applies the blend factor prior to the blend function
1417        * regardless of what function is used. Technically, this means the
1418        * hardware can do MORE than GL or Vulkan specify. However, it also
1419        * means that, for MIN and MAX, we have to stomp the blend factor to ONE
1420        * to make it a no-op.
1421        */
1422       uint32_t SourceBlendFactor;
1423       uint32_t DestinationBlendFactor;
1424       uint32_t SourceAlphaBlendFactor;
1425       uint32_t DestinationAlphaBlendFactor;
1426       if (dyn->cb.attachments[att].color_blend_op == VK_BLEND_OP_MIN ||
1427           dyn->cb.attachments[att].color_blend_op == VK_BLEND_OP_MAX) {
1428          SourceBlendFactor = BLENDFACTOR_ONE;
1429          DestinationBlendFactor = BLENDFACTOR_ONE;
1430       } else {
1431          SourceBlendFactor = vk_to_intel_blend[
1432             dyn->cb.attachments[att].src_color_blend_factor];
1433          DestinationBlendFactor = vk_to_intel_blend[
1434             dyn->cb.attachments[att].dst_color_blend_factor];
1435       }
1436 
1437       if (dyn->cb.attachments[att].alpha_blend_op == VK_BLEND_OP_MIN ||
1438           dyn->cb.attachments[att].alpha_blend_op == VK_BLEND_OP_MAX) {
1439          SourceAlphaBlendFactor = BLENDFACTOR_ONE;
1440          DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
1441       } else {
1442          SourceAlphaBlendFactor = vk_to_intel_blend[
1443             dyn->cb.attachments[att].src_alpha_blend_factor];
1444          DestinationAlphaBlendFactor = vk_to_intel_blend[
1445             dyn->cb.attachments[att].dst_alpha_blend_factor];
1446       }
1447 
1448       /* Replace any Src1 blend factor with 1.0 if dual source blending
1449        * is not enabled.
1450        */
1451       if (has_fs_stage && !has_fs_dual_src) {
1452          if (is_src1_blend_factor(SourceBlendFactor))
1453             SourceBlendFactor = BLENDFACTOR_ONE;
1454          if (is_src1_blend_factor(DestinationBlendFactor))
1455             DestinationBlendFactor = BLENDFACTOR_ONE;
1456       }
1457 
1458       if (instance->intel_enable_wa_14018912822 &&
1459           intel_needs_workaround(device->info, 14018912822) &&
1460           dyn->ms.rasterization_samples > 1) {
1461          if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
1462             DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
1463             color_blend_zero = true;
1464          }
1465          if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
1466             DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
1467             alpha_blend_zero = true;
1468          }
1469       }
1470 
1471       SET(BLEND_STATE, blend.rts[rt].SourceBlendFactor, SourceBlendFactor);
1472       SET(BLEND_STATE, blend.rts[rt].DestinationBlendFactor, DestinationBlendFactor);
1473       SET(BLEND_STATE, blend.rts[rt].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
1474       SET(BLEND_STATE, blend.rts[rt].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
1475    }
1476    gfx->color_blend_zero = color_blend_zero;
1477    gfx->alpha_blend_zero = alpha_blend_zero;
1478 
1479    SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
1480 
1481    if (rt_0 == MESA_VK_ATTACHMENT_UNUSED)
1482       rt_0 = 0;
1483 
1484    /* Program 3DSTATE_PS_BLEND to be consistent with the BLEND_STATE_ENTRY
1485     * of RT 0.
1486     */
1487    SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
1488    SET(PS_BLEND, ps_blend.ColorBufferBlendEnable,
1489                  GET(blend.rts[rt_0].ColorBufferBlendEnable));
1490    SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor,
1491                  GET(blend.rts[rt_0].SourceAlphaBlendFactor));
1492    SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor,
1493                  gfx->alpha_blend_zero ?
1494                  BLENDFACTOR_CONST_ALPHA :
1495                  GET(blend.rts[rt_0].DestinationAlphaBlendFactor));
1496    SET(PS_BLEND, ps_blend.SourceBlendFactor,
1497                  GET(blend.rts[rt_0].SourceBlendFactor));
1498    SET(PS_BLEND, ps_blend.DestinationBlendFactor,
1499                  gfx->color_blend_zero ?
1500                  BLENDFACTOR_CONST_COLOR :
1501                  GET(blend.rts[rt_0].DestinationBlendFactor));
1502    SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
1503    SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable,
1504                  GET(blend.IndependentAlphaBlendEnable));
1505    SET(PS_BLEND, ps_blend.AlphaToCoverageEnable,
1506                  dyn->ms.alpha_to_coverage_enable);
1507 }
1508 
1509 ALWAYS_INLINE static void
1510 update_blend_constants(struct anv_gfx_dynamic_state *hw_state,
1511                        const struct vk_dynamic_graphics_state *dyn,
1512                        const struct anv_cmd_graphics_state *gfx)
1513 {
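   /* If Wa_14018912822 replaced a ZERO destination factor with a constant
    * factor in update_blend_state(), force the corresponding blend constants
    * to 0.0 so the blend result is unchanged.
    */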
1514    SET(CC_STATE, cc.BlendConstantColorRed,
1515                  gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
1516    SET(CC_STATE, cc.BlendConstantColorGreen,
1517                  gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
1518    SET(CC_STATE, cc.BlendConstantColorBlue,
1519                  gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
1520    SET(CC_STATE, cc.BlendConstantColorAlpha,
1521                  gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
1522 }
1523 
1524 ALWAYS_INLINE static void
1525 update_viewports(struct anv_gfx_dynamic_state *hw_state,
1526                  const struct vk_dynamic_graphics_state *dyn,
1527                  const struct anv_cmd_graphics_state *gfx,
1528                  const struct anv_device *device)
1529 {
1530    const struct anv_instance *instance = device->physical->instance;
1531    const VkViewport *viewports = dyn->vp.viewports;
1532 
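   /* With depth_clip_negative_one_to_one, the viewport transform halves the
    * depth range (scale = 0.5) and recenters it at (minDepth + maxDepth) / 2
    * via m32, mapping z in [-1, 1] onto [minDepth, maxDepth].
    */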
1533    const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
1534 
1535       for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1536          const VkViewport *vp = &viewports[i];
1537 
1538          /* The gfx7 state struct has just the matrix and guardband fields, the
1539           * gfx8 struct adds the min/max viewport fields. */
1540          struct GENX(SF_CLIP_VIEWPORT) sfv = {
1541             .ViewportMatrixElementm00 = vp->width / 2,
1542             .ViewportMatrixElementm11 = vp->height / 2,
1543             .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
1544             .ViewportMatrixElementm30 = vp->x + vp->width / 2,
1545             .ViewportMatrixElementm31 = vp->y + vp->height / 2,
1546             .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
1547                (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
1548             .XMinClipGuardband = -1.0f,
1549             .XMaxClipGuardband = 1.0f,
1550             .YMinClipGuardband = -1.0f,
1551             .YMaxClipGuardband = 1.0f,
1552             .XMinViewPort = vp->x,
1553             .XMaxViewPort = vp->x + vp->width - 1,
1554             .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
1555             .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
1556          };
1557 
1558          /* Fix depth test misrenderings by lowering translated depth range */
1559          if (instance->lower_depth_range_rate != 1.0f)
1560             sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
1561 
1562          const uint32_t fb_size_max = 1 << 14;
1563          uint32_t x_min = 0, x_max = fb_size_max;
1564          uint32_t y_min = 0, y_max = fb_size_max;
1565 
1566          /* If we have a valid renderArea, include that */
1567          if (gfx->render_area.extent.width > 0 &&
1568              gfx->render_area.extent.height > 0) {
1569             x_min = MAX2(x_min, gfx->render_area.offset.x);
1570             x_max = MIN2(x_max, gfx->render_area.offset.x +
1571                                 gfx->render_area.extent.width);
1572             y_min = MAX2(y_min, gfx->render_area.offset.y);
1573             y_max = MIN2(y_max, gfx->render_area.offset.y +
1574                                 gfx->render_area.extent.height);
1575          }
1576 
1577          /* The client is required to have enough scissors for whatever it
1578           * sets as ViewportIndex but it's possible that they've got more
1579           * viewports set from a previous command. Also, from the Vulkan
1580           * 1.3.207 spec:
1581           *
1582           *    "The application must ensure (using scissor if necessary) that
1583           *    all rendering is contained within the render area."
1584           *
1585           * If the client doesn't set a scissor, that basically means it
1586           * guarantees everything is in-bounds already. If we end up using a
1587           * guardband of [-1, 1] in that case, there shouldn't be much loss.
1588           * It's theoretically possible that they could do all their clipping
1589           * with clip planes but that'd be a bit odd.
1590           */
1591          if (i < dyn->vp.scissor_count) {
1592             const VkRect2D *scissor = &dyn->vp.scissors[i];
1593             x_min = MAX2(x_min, scissor->offset.x);
1594             x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
1595             y_min = MAX2(y_min, scissor->offset.y);
1596             y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
1597          }
1598 
1599          /* Only bother calculating the guardband if our known render area is
1600           * less than the maximum size. Otherwise, it will calculate [-1, 1]
1601           * anyway but possibly with precision loss.
1602           */
1603          if (x_min > 0 || x_max < fb_size_max ||
1604              y_min > 0 || y_max < fb_size_max) {
1605             intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
1606                                            sfv.ViewportMatrixElementm00,
1607                                            sfv.ViewportMatrixElementm11,
1608                                            sfv.ViewportMatrixElementm30,
1609                                            sfv.ViewportMatrixElementm31,
1610                                            &sfv.XMinClipGuardband,
1611                                            &sfv.XMaxClipGuardband,
1612                                            &sfv.YMinClipGuardband,
1613                                            &sfv.YMaxClipGuardband);
1614          }
1615 
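/* Like SET(), but compares against the SF_CLIP_VIEWPORT we just computed and
 * only flags the HW state dirty when a value actually changes.
 */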
1616 #define SET_VP(bit, state, field)                                        \
1617          do {                                                           \
1618             if (hw_state->state.field != sfv.field) {                   \
1619                hw_state->state.field = sfv.field;                       \
1620                BITSET_SET(hw_state->dirty,                              \
1621                           ANV_GFX_STATE_##bit);                         \
1622             }                                                           \
1623          } while (0)
1624          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
1625          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
1626          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
1627          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
1628          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
1629          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
1630          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
1631          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
1632          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
1633          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
1634          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
1635          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
1636          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
1637          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
1638 #undef SET_VP
1639 
1640          const bool depth_range_unrestricted =
1641             device->vk.enabled_extensions.EXT_depth_range_unrestricted;
1642 
1643          float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
1644          float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;
1645 
1646          float min_depth = dyn->rs.depth_clamp_enable ?
1647                            MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
1648          float max_depth = dyn->rs.depth_clamp_enable ?
1649                            MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
1650 
1651          if (dyn->rs.depth_clamp_enable &&
1652             dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT) {
1653             min_depth = dyn->vp.depth_clamp_range.minDepthClamp;
1654             max_depth = dyn->vp.depth_clamp_range.maxDepthClamp;
1655          }
1656 
1657          SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
1658          SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
1659       }
1660 
1661       /* If the HW state is already considered dirty or the previously
1662        * programmed viewport count is smaller than what we need, update the
1663        * viewport count and ensure the HW state is dirty. Otherwise, if the
1664        * number of viewports programmed previously was larger than what we
1665        * need now, there is no need to reemit; we can keep the old values.
1666        */
1667       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
1668           hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
1669          hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
1670          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
1671       }
1672       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
1673           hw_state->vp_cc.count < dyn->vp.viewport_count) {
1674          hw_state->vp_cc.count = dyn->vp.viewport_count;
1675          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
1676       }
1677 }
1678 
1679 ALWAYS_INLINE static void
1680 update_scissors(struct anv_gfx_dynamic_state *hw_state,
1681                 const struct vk_dynamic_graphics_state *dyn,
1682                 const struct anv_cmd_graphics_state *gfx,
1683                 VkCommandBufferLevel cmd_buffer_level)
1684 {
1685    const VkRect2D *scissors = dyn->vp.scissors;
1686    const VkViewport *viewports = dyn->vp.viewports;
1687 
1688    for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
1689       const VkRect2D *s = &scissors[i];
1690       const VkViewport *vp = &viewports[i];
1691 
1692       const int max = 0xffff;
1693 
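      /* HW scissor rectangle coordinates are inclusive, hence the -1 on the
       * max edges computed below.
       */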
1694       uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
1695       uint32_t x_min = MAX2(s->offset.x, vp->x);
1696       int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
1697                            MAX2(vp->y, vp->y + vp->height) - 1);
1698       int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
1699                            vp->x + vp->width - 1);
1700 
1701       y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
1702       x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
1703 
1704       /* Do this math using int64_t so overflow gets clamped correctly. */
1705       if (cmd_buffer_level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1706          y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
1707          x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
1708          y_max = CLAMP((uint64_t) y_max, 0,
1709                        gfx->render_area.offset.y +
1710                        gfx->render_area.extent.height - 1);
1711          x_max = CLAMP((uint64_t) x_max, 0,
1712                        gfx->render_area.offset.x +
1713                        gfx->render_area.extent.width - 1);
1714       }
1715 
1716       if (s->extent.width <= 0 || s->extent.height <= 0) {
1717          /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
1718           * ymax < ymin for empty clips. In case clip x, y, width, height are
1719           * all 0, the clamps below produce 0 for xmin, ymin, xmax, ymax,
1720           * which isn't what we want. Just special case empty clips and
1721           * produce a canonical empty clip.
1722           */
1723          SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
1724          SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
1725          SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
1726          SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
1727       } else {
1728          SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
1729          SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
1730          SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
1731          SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
1732       }
1733    }
1734 
1735    /* If the HW state is already considered dirty or the previously
1736     * programmed scissor count is smaller than what we need, update the
1737     * scissor count and ensure the HW state is dirty. Otherwise, if the
1738     * number of scissors programmed previously was larger than what we
1739     * need now, there is no need to reemit; we can keep the old values.
1740     */
1741    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
1742        hw_state->scissor.count < dyn->vp.scissor_count) {
1743       hw_state->scissor.count = dyn->vp.scissor_count;
1744       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
1745    }
1746 }
1747 
1748 #if GFX_VERx10 == 125
1749 ALWAYS_INLINE static void
1750 update_tbimr_info(struct anv_gfx_dynamic_state *hw_state,
1751                   const struct anv_device *device,
1752                   const struct anv_cmd_graphics_state *gfx,
1753                   const struct intel_l3_config *l3_config)
1754 {
1755    unsigned fb_width, fb_height, tile_width, tile_height;
1756 
1757    if (device->physical->instance->enable_tbimr &&
1758        calculate_render_area(gfx, &fb_width, &fb_height) &&
1759        calculate_tile_dimensions(device, gfx, l3_config,
1760                                  fb_width, fb_height,
1761                                  &tile_width, &tile_height)) {
1762       /* Use a batch size of 128 polygons per slice as recommended
1763        * by BSpec 68436 "TBIMR Programming". */
1764       const unsigned num_slices = device->info->num_slices;
1765       const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
1766 
1767       SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
1768       SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
1769       SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount,
1770           DIV_ROUND_UP(fb_height, tile_height));
1771       SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount,
1772           DIV_ROUND_UP(fb_width, tile_width));
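      /* TBIMRBatchSize encodes the batch size as log2(polygons) - 5, so a
       * programmed value of n selects (32 << n) polygons.
       */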
1773       SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
1774           util_logbase2(batch_size) - 5);
1775       SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
1776       SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
1777    } else {
1778       hw_state->use_tbimr = false;
1779    }
1780 }
1781 #endif
1782 
1783 /**
1784  * This function takes the Vulkan runtime values & dirty states and updates
1785  * the values in anv_gfx_dynamic_state, flagging HW instructions for
1786  * reemission if the values are changing.
1787  *
1788  * Nothing is emitted in the batch buffer.
1789  */
1790 static void
1791 cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
1792                                    const struct anv_device *device,
1793                                    const struct vk_dynamic_graphics_state *dyn,
1794                                    struct anv_cmd_graphics_state *gfx,
1795                                    const struct anv_graphics_pipeline *pipeline,
1796                                    VkCommandBufferLevel cmd_buffer_level)
1797 {
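   /* Each block below checks the command buffer dirty bits and the runtime
    * dynamic state dirty bits against the inputs of one piece of HW state,
    * re-evaluating it only when an input may have changed.
    */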
1798    UNUSED bool fs_msaa_changed = false;
1799    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1800        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
1801        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
1802        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
1803       update_fs_msaa_flags(hw_state, dyn, pipeline);
1804 
1805    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1806        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
1807       update_ps(hw_state, device, dyn, pipeline);
1808       update_ps_extra_wm(hw_state, pipeline);
1809    }
1810 
1811    if (gfx->dirty &
1812 #if GFX_VERx10 >= 125
1813        ANV_CMD_DIRTY_PIPELINE
1814 #else
1815        (ANV_CMD_DIRTY_PIPELINE | ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)
1816 #endif
1817       )
1818       update_ps_extra_has_uav(hw_state, gfx, pipeline);
1819 
1820    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1821        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE))
1822       update_ps_extra_kills_pixel(hw_state, dyn, gfx, pipeline);
1823 
1824    if ((gfx->dirty & ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE) ||
1825        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
1826        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
1827       update_streamout(hw_state, dyn, gfx, pipeline);
1828 
1829    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX))
1830       update_provoking_vertex(hw_state, dyn, pipeline);
1831 
1832    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1833        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY))
1834       update_topology(hw_state, dyn, pipeline);
1835 
1836    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1837        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1838        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1839        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
1840       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
1841 
1842 #if GFX_VER >= 11
1843    if (device->vk.enabled_extensions.KHR_fragment_shading_rate &&
1844        ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1845         BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR) ||
1846         BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)))
1847       update_cps(hw_state, device, dyn, pipeline);
1848 #endif /* GFX_VER >= 11 */
1849 
1850    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1851        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN))
1852       update_te(hw_state, dyn, pipeline);
1853 
1854    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
1855       update_line_width(hw_state, dyn);
1856 
1857    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS))
1858       update_sf_global_depth_bias(hw_state, dyn);
1859 
1860    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
1861       update_clip_api_mode(hw_state, dyn);
1862 
1863    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
1864       update_clip_max_viewport(hw_state, dyn);
1865 
1866    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1867        (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
1868        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
1869        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
1870        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
1871        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
1872        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
1873        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
1874        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
1875        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
1876        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
1877        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
1878        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE))
1879       update_clip_raster(hw_state, dyn, gfx, pipeline);
1880 
1881    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES))
1882       update_multisample(hw_state, dyn);
1883 
1884    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK))
1885       update_sample_mask(hw_state, dyn);
1886 
1887    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
1888 #if GFX_VER == 9
1889        /* For the PMA fix */
1890        (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1891 #endif
1892        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
1893        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
1894        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
1895        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
1896        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
1897        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
1898        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
1899        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE))
1900       update_wm_depth_stencil(hw_state, dyn, gfx, device);
1901 
1902 #if GFX_VER >= 12
1903    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
1904        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS))
1905       update_depth_bounds(hw_state, dyn);
1906 #endif
1907 
1908    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
1909        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
1910       update_line_stipple(hw_state, dyn);
1911 
1912    if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) ||
1913        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
1914       update_vf_restart(hw_state, dyn, gfx);
1915 
1916    if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER)
1917       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER);
1918 
1919 #if GFX_VERx10 >= 125
1920    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
1921       update_vfg_list_cut_index(hw_state, dyn);
1922 #endif
1923 
1924    if (device->vk.enabled_extensions.EXT_sample_locations &&
1925        (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
1926         BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
1927       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN);
1928 
1929    if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1930        (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
1931        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
1932        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
1933        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
1934        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
1935        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
1936        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
1937        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
1938        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
1939       const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1940       update_blend_state(hw_state, dyn, gfx, device,
1941                          wm_prog_data != NULL,
1942                          wm_prog_data != NULL ?
1943                          wm_prog_data->dual_src_blend : false);
1944    }
1945 
1946    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
1947       update_blend_constants(hw_state, dyn, gfx);
1948 
1949    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1950        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1951        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1952        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
1953        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
1954        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE))
1955       update_viewports(hw_state, dyn, gfx, device);
1956 
1957    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1958        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1959        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS))
1960       update_scissors(hw_state, dyn, gfx, cmd_buffer_level);
1961 
1962 #if GFX_VERx10 == 125
1963    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS))
1964       update_tbimr_info(hw_state, device, gfx, pipeline->base.base.l3_config);
1965 #endif
1966 
1967 #if INTEL_WA_14018283232_GFX_VER
1968    if (intel_needs_workaround(device->info, 14018283232) &&
1969        ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1970         BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE))) {
1971       const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1972       SET(WA_14018283232, wa_14018283232_toggle,
1973           dyn->ds.depth.bounds_test.enable &&
1974           wm_prog_data &&
1975           wm_prog_data->uses_kill);
1976    }
1977 #endif
1978 
1979    /* If the pipeline uses a dynamic value of patch_control_points and
1980     * either the pipeline or the dynamic value changed, check the value and
1981     * reemit if needed.
1982     */
1983    if (pipeline->dynamic_patch_control_points &&
1984        ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
1985         BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)))
1986       SET(TCS_INPUT_VERTICES, tcs_input_vertices, dyn->ts.patch_control_points);
1987 }
1988 
1989 #undef GET
1990 #undef SET
1991 #undef SET_STAGE
1992 #undef SETUP_PROVOKING_VERTEX
1993 
1994 /**
1995  * This function takes the Vulkan runtime values & dirty states and updates
1996  * the values in anv_gfx_dynamic_state, flagging HW instructions for
1997  * reemission if the values are changing.
1998  *
1999  * Nothing is emitted in the batch buffer.
2000  */
2001 void
2002 genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
2003 {
2004    cmd_buffer_flush_gfx_runtime_state(
2005       &cmd_buffer->state.gfx.dyn_state,
2006       cmd_buffer->device,
2007       &cmd_buffer->vk.dynamic_graphics_state,
2008       &cmd_buffer->state.gfx,
2009       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline),
2010       cmd_buffer->vk.level);
2011 
2012    vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
2013 }
2014 
2015 static void
2016 emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
2017 {
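   /* Program a minimal 3D pipeline and issue one dummy triangle per slice;
    * CLIPMODE_REJECT_ALL below ensures nothing is actually rasterized.
    */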
2018 #if GFX_VERx10 >= 125
2019    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
2020       vfg.DistributionMode = RR_STRICT;
2021    }
2022    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
2023       vf.GeometryDistributionEnable = true;
2024    }
2025 #endif
2026 
2027 #if GFX_VER >= 12
2028    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
2029       pr.ReplicaMask = 1;
2030    }
2031 #endif
2032 
2033    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
2034       rr.CullMode = CULLMODE_NONE;
2035       rr.FrontFaceFillMode = FILL_MODE_SOLID;
2036       rr.BackFaceFillMode = FILL_MODE_SOLID;
2037    }
2038 
2039    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
2040    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);
2041 
2042 #if GFX_VER >= 11
2043    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
2044 #endif
2045 
2046    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
2047       clip.ClipEnable = true;
2048       clip.ClipMode = CLIPMODE_REJECT_ALL;
2049    }
2050 
2051    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
2052    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
2053    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
2054    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
2055    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
2056    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);
2057 
2058    uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
2059                                                GENX(3DSTATE_VERTEX_ELEMENTS));
2060    uint32_t *ve_pack_dest = &vertex_elements[1];
2061 
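   /* Two VF-synthesized vertex elements with no vertex buffers: element 0
    * stores (0, 0, 0, 0) and element 1 stores (0, 0, 1, 1).
    */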
2062    for (int i = 0; i < 2; i++) {
2063       struct GENX(VERTEX_ELEMENT_STATE) element = {
2064          .Valid = true,
2065          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
2066          .Component0Control = VFCOMP_STORE_0,
2067          .Component1Control = VFCOMP_STORE_0,
2068          .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
2069          .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
2070       };
2071       GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
2072       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
2073    }
2074 
2075    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
2076       topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
2077    }
2078 
2079    /* Emit dummy draw per slice. */
2080    for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
2081       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
2082          prim.VertexCountPerInstance = 3;
2083          prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
2084          prim.InstanceCount = 1;
2085          prim.VertexAccessType = SEQUENTIAL;
2086       }
2087    }
2088 }
2089 
2090 #if INTEL_WA_14018283232_GFX_VER
2091 void
2092 genX(batch_emit_wa_14018283232)(struct anv_batch *batch)
2093 {
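   /* Emit an immediate RESOURCE_BARRIER that signals at the color stage and
    * waits at the pixel stage.
    */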
2094    anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
2095       barrier.ResourceBarrierBody = (struct GENX(RESOURCE_BARRIER_BODY)) {
2096          .BarrierType = RESOURCE_BARRIER_TYPE_IMMEDIATE,
2097          .SignalStage = RESOURCE_BARRIER_STAGE_COLOR,
2098          .WaitStage = RESOURCE_BARRIER_STAGE_PIXEL,
2099       };
2100    }
2101 }
2102 #endif
2103 
2104 /**
2105  * This function handles dirty state emission to the batch buffer.
2106  */
2107 static void
2108 cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
2109 {
2110    struct anv_device *device = cmd_buffer->device;
2111    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2112    struct anv_graphics_pipeline *pipeline =
2113       anv_pipeline_to_graphics(gfx->base.pipeline);
2114    const struct vk_dynamic_graphics_state *dyn =
2115       &cmd_buffer->vk.dynamic_graphics_state;
2116    struct anv_push_constants *push_consts =
2117       &cmd_buffer->state.gfx.base.push_constants;
2118    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
2119    const bool protected = cmd_buffer->vk.pool->flags &
2120                           VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
2121 
2122 #if INTEL_WA_16011107343_GFX_VER
2123    /* Will be emitted in front of every draw instead */
2124    if (intel_needs_workaround(device->info, 16011107343) &&
2125        anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
2126       BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
2127 #endif
2128 
2129 #if INTEL_WA_22018402687_GFX_VER
2130    /* Will be emitted in front of every draw instead */
2131    if (intel_needs_workaround(device->info, 22018402687) &&
2132        anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
2133       BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
2134 #endif
2135 
2136    /*
2137     * Values provided by push constants
2138     */
2139 
2140    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TCS_INPUT_VERTICES)) {
2141       push_consts->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
2142       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
2143       gfx->base.push_constants_data_dirty = true;
2144    }
2145 
2146    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
2147       push_consts->gfx.fs_msaa_flags = hw_state->fs_msaa_flags;
2148       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
2149       gfx->base.push_constants_data_dirty = true;
2150    }
2151 
2152    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
2153       genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);
2154 
2155       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);
2156 
2157       memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
2158              sizeof(struct intel_urb_config));
2159    }
2160 
2161    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
2162       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication);
2163 
2164    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
2165       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing);
2166 
2167    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
2168       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs);
2169 
2170 #if GFX_VER >= 11
2171    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
2172       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2);
2173 #endif
2174 
2175    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS)) {
2176       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
2177                                               final.vs, protected);
2178    }
2179 
2180    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS)) {
2181       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
2182                                               final.hs, protected);
2183    }
2184 
2185    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS)) {
2186       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
2187                                               final.ds, protected);
2188    }
2189 
2190    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS)) {
2191       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
2192          vfs.StatisticsEnable = true;
2193       }
2194    }
2195 
2196    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
2197       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);
2198 
2199    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
2200       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz);
2201 
2202    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
2203       /* Wa_16011773973:
2204        * If SOL is enabled and SO_DECL state has to be programmed,
2205        *    1. Send 3D State SOL state with SOL disabled
2206        *    2. Send SO_DECL NP state
2207        *    3. Send 3D State SOL with SOL Enabled
2208        */
2209       if (intel_needs_workaround(device->info, 16011773973) &&
2210           pipeline->uses_xfb)
2211          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);
2212 
2213       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
2214                                     final.so_decl_list);
2215 
2216 #if GFX_VER >= 11 && GFX_VER < 20
2217       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
2218        * 3DSTATE_SO_DECL_LIST:
2219        *
2220        *    "Workaround: This command must be followed by a PIPE_CONTROL with
2221        *     CS Stall bit set."
2222        *
2223        * On DG2+ also known as Wa_1509820217.
2224        */
2225       genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2226                                    cmd_buffer->state.current_pipeline,
2227                                    ANV_PIPE_CS_STALL_BIT);
2228 #endif
2229    }
2230 
2231    if (device->vk.enabled_extensions.EXT_mesh_shader) {
2232       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL)) {
2233          anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
2234                                                  final.mesh_control, protected);
2235       }
2236 
2237       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
2238          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader);
2239 
2240       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
2241          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib);
2242 
2243       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL)) {
2244          anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
2245                                                  final.task_control, protected);
2246       }
2247 
2248       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
2249          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader);
2250 
2251       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
2252          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib);
2253 
2254       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
2255          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_mesh);
2256 
2257       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
2258          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.clip_mesh);
2259    } else {
2260       assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
2261              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
2262              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
2263              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
2264              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
2265              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
2266              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
2267              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
2268    }
2269 
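/* Helpers to pull a tracked value out of anv_gfx_dynamic_state, either as a
 * designated initializer (INIT) or as an assignment into the instruction
 * being emitted (SET).
 */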
2270 #define INIT(category, name) \
2271    .name = hw_state->category.name
2272 #define SET(s, category, name) \
2273    s.name = hw_state->category.name
2274 
2275    /* Now the potentially dynamic instructions */
2276 
2277    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) {
2278       anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_PS),
2279                                      pipeline, partial.ps, ps, protected) {
2280          SET(ps, ps, KernelStartPointer0);
2281          SET(ps, ps, KernelStartPointer1);
2282          SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
2283          SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
2284 
2285 #if GFX_VER < 20
2286          SET(ps, ps, KernelStartPointer2);
2287          SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
2288 
2289          SET(ps, ps, _8PixelDispatchEnable);
2290          SET(ps, ps, _16PixelDispatchEnable);
2291          SET(ps, ps, _32PixelDispatchEnable);
2292 #else
2293          SET(ps, ps, Kernel0Enable);
2294          SET(ps, ps, Kernel1Enable);
2295          SET(ps, ps, Kernel0SIMDWidth);
2296          SET(ps, ps, Kernel1SIMDWidth);
2297          SET(ps, ps, Kernel0PolyPackingPolicy);
2298 #endif
2299          SET(ps, ps, PositionXYOffsetSelect);
2300       }
2301    }
2302 
2303    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA) ||
2304        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_COARSE_STATE)) {
2305       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
2306                            pipeline, partial.ps_extra, pse) {
2307          SET(pse, ps_extra, PixelShaderHasUAV);
2308          SET(pse, ps_extra, PixelShaderIsPerSample);
2309 #if GFX_VER >= 11
2310          SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
2311 #endif
2312          SET(pse, ps_extra, PixelShaderKillsPixel);
2313 
2314 #if INTEL_WA_18038825448_GFX_VER
2315          /* Add a dependency if either the shader needs it (because of a runtime
2316           * change through a pre-rasterization shader) or if we notice a change.
2317           */
2318          pse.EnablePSDependencyOnCPsizeChange =
2319             hw_state->ps_extra.EnablePSDependencyOnCPsizeChange ||
2320             BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_COARSE_STATE);
2321 #elif GFX_VERx10 >= 125
2322          SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
2323 #endif
2324       }
2325    }
2326 
2327    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
2328       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
2329                            pipeline, partial.clip, clip) {
2330          SET(clip, clip, APIMode);
2331          SET(clip, clip, ViewportXYClipTestEnable);
2332          SET(clip, clip, TriangleStripListProvokingVertexSelect);
2333          SET(clip, clip, LineStripListProvokingVertexSelect);
2334          SET(clip, clip, TriangleFanProvokingVertexSelect);
2335          SET(clip, clip, MaximumVPIndex);
2336       }
2337    }
2338 
2339    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
2340       genX(streamout_prologue)(cmd_buffer);
2341 
2342       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
2343                            pipeline, partial.so, so) {
2344          SET(so, so, RenderingDisable);
2345          SET(so, so, RenderStreamSelect);
2346          SET(so, so, ReorderMode);
2347          SET(so, so, ForceRendering);
2348       }
2349    }
2350 
2351    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
2352       struct anv_state sf_clip_state =
2353          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2354                                             hw_state->vp_sf_clip.count * 64, 64);
2355 
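      /* Each SF_CLIP_VIEWPORT element is GENX(SF_CLIP_VIEWPORT_length)
       * DWords (16 DWords, i.e. 64 bytes), hence the count * 64 allocation
       * above and the i * 64 packing stride below.
       */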
2356       for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
2357          struct GENX(SF_CLIP_VIEWPORT) sfv = {
2358             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
2359             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
2360             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
2361             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
2362             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
2363             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
2364             INIT(vp_sf_clip.elem[i], XMinClipGuardband),
2365             INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
2366             INIT(vp_sf_clip.elem[i], YMinClipGuardband),
2367             INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
2368             INIT(vp_sf_clip.elem[i], XMinViewPort),
2369             INIT(vp_sf_clip.elem[i], XMaxViewPort),
2370             INIT(vp_sf_clip.elem[i], YMinViewPort),
2371             INIT(vp_sf_clip.elem[i], YMaxViewPort),
2372          };
2373          GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
2374       }
2375 
2376       anv_batch_emit(&cmd_buffer->batch,
2377                      GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
2378          clip.SFClipViewportPointer = sf_clip_state.offset;
2379       }
2380    }
2381 
2382    /* Force CC_VIEWPORT reallocation on Gfx9 when reprogramming
2383     * 3DSTATE_VIEWPORT_STATE_POINTERS_CC:
2384     *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
2385     */
2386    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
2387        (GFX_VER == 9 &&
2388         BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR))) {
2389       hw_state->vp_cc.state =
2390          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2391                                             hw_state->vp_cc.count * 8, 32);
2392 
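      /* CC_VIEWPORT holds only the min/max depth pair: 2 DWords (8 bytes)
       * per element, matching the count * 8 allocation and i * 8 stride.
       */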
2393       for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
2394          struct GENX(CC_VIEWPORT) cc_viewport = {
2395             INIT(vp_cc.elem[i], MinimumDepth),
2396             INIT(vp_cc.elem[i], MaximumDepth),
2397          };
2398          GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
2399                                 &cc_viewport);
2400       }
2401 
2402       /* Dirty the pointers to reemit 3DSTATE_VIEWPORT_STATE_POINTERS_CC below
2403        */
2404       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
2405    }
2406 
2407    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) {
2408       anv_batch_emit(&cmd_buffer->batch,
2409                      GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
2410          cc.CCViewportPointer = hw_state->vp_cc.state.offset;
2411       }
2412       cmd_buffer->state.gfx.viewport_set = true;
2413    }
2414 
2415    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
2416       /* Wa_1409725701:
2417        *
2418        *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
2419        *    stored as an array of up to 16 elements. The location of first
2420        *    element of the array, as specified by Pointer to SCISSOR_RECT,
2421        *    should be aligned to a 64-byte boundary."
2422        */
2423       struct anv_state scissor_state =
2424          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2425                                             hw_state->scissor.count * 8, 64);
2426 
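      /* SCISSOR_RECT is likewise 2 DWords (8 bytes) per element; only the
       * base of the array needs the 64-byte alignment mandated by the
       * workaround above.
       */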
2427       for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
2428          struct GENX(SCISSOR_RECT) scissor = {
2429             INIT(scissor.elem[i], ScissorRectangleYMin),
2430             INIT(scissor.elem[i], ScissorRectangleXMin),
2431             INIT(scissor.elem[i], ScissorRectangleYMax),
2432             INIT(scissor.elem[i], ScissorRectangleXMax),
2433          };
2434          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
2435       }
2436 
2437       anv_batch_emit(&cmd_buffer->batch,
2438                      GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
2439          ssp.ScissorRectPointer = scissor_state.offset;
2440       }
2441    }
2442 
2443    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
2444       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
2445          SET(vft, vft, PrimitiveTopologyType);
2446       }
2447    }
2448 
2449    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
2450       genX(batch_emit_vertex_input)(&cmd_buffer->batch, device,
2451                                     pipeline, dyn->vi);
2452    }
2453 
2454    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
2455       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
2456                            pipeline, partial.te, te) {
2457          SET(te, te, OutputTopology);
2458       }
2459    }
2460 
2461    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
2462       anv_batch_emit_merge_protected(&cmd_buffer->batch, GENX(3DSTATE_GS),
2463                                      pipeline, partial.gs, gs, protected) {
2464          SET(gs, gs, ReorderMode);
2465       }
2466    }
2467 
2468 #if GFX_VER >= 30
2469    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_COARSE_PIXEL)) {
2470       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_COARSE_PIXEL), coarse_pixel) {
2471          coarse_pixel.DisableCPSPointers = true;
2472          SET(coarse_pixel, coarse_pixel, CPSizeX);
2473          SET(coarse_pixel, coarse_pixel, CPSizeY);
2474          SET(coarse_pixel, coarse_pixel, CPSizeCombiner0Opcode);
2475          SET(coarse_pixel, coarse_pixel, CPSizeCombiner1Opcode);
2476       }
2477    }
2478 #else
2479    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
2480 #if GFX_VER == 11
2481       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
2482          SET(cps, cps, CoarsePixelShadingMode);
2483          SET(cps, cps, MinCPSizeX);
2484          SET(cps, cps, MinCPSizeY);
2485       }
2486 #elif GFX_VER >= 12
2487       /* TODO: we can optimize this flush in the following cases:
2488        *
2489        *    In the case where the last geometry shader emits a value that is
2490        *    not constant, we can avoid this stall because we can synchronize
2491        *    the pixel shader internally with
2492        *    3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
2493        *
2494        *    If we know that the previous pipeline and the current one are
2495        *    using the same fragment shading rate.
2496        */
2497       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2498 #if GFX_VERx10 >= 125
2499          pc.PSSStallSyncEnable = true;
2500 #else
2501          pc.PSDSyncEnable = true;
2502 #endif
2503       }
2504 
2505       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
2506          SET(cps, cps, CoarsePixelShadingStateArrayPointer);
2507       }
2508 #endif
2509    }
2510 #endif /* GFX_VER >= 30 */
2511 
2512    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
2513       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
2514                            pipeline, partial.sf, sf) {
2515          SET(sf, sf, LineWidth);
2516          SET(sf, sf, TriangleStripListProvokingVertexSelect);
2517          SET(sf, sf, LineStripListProvokingVertexSelect);
2518          SET(sf, sf, TriangleFanProvokingVertexSelect);
2519          SET(sf, sf, LegacyGlobalDepthBiasEnable);
2520       }
2521    }
2522 
2523    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
2524       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), raster) {
2525          /* For details on 3DSTATE_RASTER multisample state, see the BSpec
2526           * table "Multisample Modes State".
2527           *
2528           * NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the SKL PMA fix
2529           * computations. If we ever set this bit to a different value, they
2530           * will need to be updated accordingly.
2531           */
2532          raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
2533          raster.ForceMultisampling = false;
2534          raster.ScissorRectangleEnable = true;
2535 
2536          SET(raster, raster, APIMode);
2537          SET(raster, raster, DXMultisampleRasterizationEnable);
2538          SET(raster, raster, AntialiasingEnable);
2539          SET(raster, raster, CullMode);
2540          SET(raster, raster, FrontWinding);
2541          SET(raster, raster, GlobalDepthOffsetEnableSolid);
2542          SET(raster, raster, GlobalDepthOffsetEnableWireframe);
2543          SET(raster, raster, GlobalDepthOffsetEnablePoint);
2544          SET(raster, raster, GlobalDepthOffsetConstant);
2545          SET(raster, raster, GlobalDepthOffsetScale);
2546          SET(raster, raster, GlobalDepthOffsetClamp);
2547          SET(raster, raster, FrontFaceFillMode);
2548          SET(raster, raster, BackFaceFillMode);
2549          SET(raster, raster, ViewportZFarClipTestEnable);
2550          SET(raster, raster, ViewportZNearClipTestEnable);
2551          SET(raster, raster, ConservativeRasterizationEnable);
2552       }
2553    }
2554 
2555    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) {
2556       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MULTISAMPLE), ms) {
2557          ms.PixelLocation              = CENTER;
2558 
2559          /* The PRM says that this bit is valid only for DX9:
2560           *
2561           *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
2562           *    should not have any effect by setting or not setting this bit.
2563           */
2564          ms.PixelPositionOffsetEnable  = false;
2565 
2566          SET(ms, ms, NumberofMultisamples);
2567       }
2568    }
2569 
2570    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
2571       hw_state->cc.state =
2572          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2573                                             GENX(COLOR_CALC_STATE_length) * 4,
2574                                             64);
2575       struct GENX(COLOR_CALC_STATE) cc = {
2576          INIT(cc, BlendConstantColorRed),
2577          INIT(cc, BlendConstantColorGreen),
2578          INIT(cc, BlendConstantColorBlue),
2579          INIT(cc, BlendConstantColorAlpha),
2580       };
2581       GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);
2582 
2583       /* Dirty the pointers to reemit 3DSTATE_CC_STATE_POINTERS below
2584        */
2585       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
2586    }
2587 
2588    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR)) {
2589       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
2590          ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
2591          ccp.ColorCalcStatePointerValid = true;
2592       }
2593    }
2594 
2595    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
2596       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
2597          SET(sm, sm, SampleMask);
2598       }
2599    }
2600 
2601    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
2602       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
2603          SET(ds, ds, DoubleSidedStencilEnable);
2604          SET(ds, ds, StencilTestMask);
2605          SET(ds, ds, StencilWriteMask);
2606          SET(ds, ds, BackfaceStencilTestMask);
2607          SET(ds, ds, BackfaceStencilWriteMask);
2608          SET(ds, ds, StencilReferenceValue);
2609          SET(ds, ds, BackfaceStencilReferenceValue);
2610          SET(ds, ds, DepthTestEnable);
2611          SET(ds, ds, DepthBufferWriteEnable);
2612          SET(ds, ds, DepthTestFunction);
2613          SET(ds, ds, StencilTestEnable);
2614          SET(ds, ds, StencilBufferWriteEnable);
2615          SET(ds, ds, StencilFailOp);
2616          SET(ds, ds, StencilPassDepthPassOp);
2617          SET(ds, ds, StencilPassDepthFailOp);
2618          SET(ds, ds, StencilTestFunction);
2619          SET(ds, ds, BackfaceStencilFailOp);
2620          SET(ds, ds, BackfaceStencilPassDepthPassOp);
2621          SET(ds, ds, BackfaceStencilPassDepthFailOp);
2622          SET(ds, ds, BackfaceStencilTestFunction);
2623       }
2624    }
2625 
2626 #if GFX_VER >= 12
2627    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
2628       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
2629          SET(db, db, DepthBoundsTestEnable);
2630          SET(db, db, DepthBoundsTestMinValue);
2631          SET(db, db, DepthBoundsTestMaxValue);
2632       }
2633    }
2634 #endif
2635 
2636    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
2637       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
2638          SET(ls, ls, LineStipplePattern);
2639          SET(ls, ls, LineStippleInverseRepeatCount);
2640          SET(ls, ls, LineStippleRepeatCount);
2641       }
2642 #if GFX_VER >= 11
2643       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
2644        * 3DSTATE_LINE_STIPPLE:
2645        *
2646        *    "Workaround: This command must be followed by a PIPE_CONTROL with
2647        *     CS Stall bit set."
2648        */
2649       genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2650                                    cmd_buffer->state.current_pipeline,
2651                                    ANV_PIPE_CS_STALL_BIT);
2652 #endif
2653    }
2654 
2655    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
2656       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
2657 #if GFX_VERx10 >= 125
2658          vf.GeometryDistributionEnable = true;
2659 #endif
2660          SET(vf, vf, IndexedDrawCutIndexEnable);
2661          SET(vf, vf, CutIndex);
2662       }
2663    }
2664 
2665    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
2666       struct anv_buffer *buffer = gfx->index_buffer;
2667       uint32_t offset = gfx->index_offset;
2668       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
2669          ib.IndexFormat           = gfx->index_type;
2670          ib.MOCS                  = anv_mocs(device,
2671                                              buffer ? buffer->address.bo : NULL,
2672                                              ISL_SURF_USAGE_INDEX_BUFFER_BIT);
2673 #if GFX_VER >= 12
2674          ib.L3BypassDisable       = true;
2675 #endif
2676          if (buffer) {
2677             ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
2678             ib.BufferSize            = gfx->index_size;
2679          }
2680       }
2681    }
2682 
2683 #if GFX_VERx10 >= 125
2684    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
2685       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
2686                            pipeline, partial.vfg, vfg) {
2687          SET(vfg, vfg, ListCutIndexEnable);
2688       }
2689    }
2690 #endif
2691 
2692    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
2693       genX(emit_sample_pattern)(&cmd_buffer->batch,
2694                                 dyn->ms.sample_locations_enable ?
2695                                 dyn->ms.sample_locations : NULL);
2696    }
2697 
2698    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
2699       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
2700                            pipeline, partial.wm, wm) {
2701          SET(wm, wm, LineStippleEnable);
2702          SET(wm, wm, BarycentricInterpolationMode);
2703       }
2704    }
2705 
2706    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
2707       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
2708          SET(blend, ps_blend, HasWriteableRT);
2709          SET(blend, ps_blend, ColorBufferBlendEnable);
2710          SET(blend, ps_blend, SourceAlphaBlendFactor);
2711          SET(blend, ps_blend, DestinationAlphaBlendFactor);
2712          SET(blend, ps_blend, SourceBlendFactor);
2713          SET(blend, ps_blend, DestinationBlendFactor);
2714          SET(blend, ps_blend, AlphaTestEnable);
2715          SET(blend, ps_blend, IndependentAlphaBlendEnable);
2716          SET(blend, ps_blend, AlphaToCoverageEnable);
2717       }
2718    }
2719 
2720    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
2721       const uint32_t num_dwords = GENX(BLEND_STATE_length) +
2722          GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
2723       hw_state->blend.state =
2724          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
2725                                             num_dwords * 4,
2726                                             64);
2727 
2728       uint32_t *dws = hw_state->blend.state.map;
2729 
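      /* The allocation is laid out as one BLEND_STATE header immediately
       * followed by MAX_RTS BLEND_STATE_ENTRY structures; the pointer
       * emitted further down references the header and the hardware
       * indexes the per-render-target entries after it.
       */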
2730       struct GENX(BLEND_STATE) blend_state = {
2731          INIT(blend, AlphaToCoverageEnable),
2732          INIT(blend, AlphaToOneEnable),
2733          INIT(blend, IndependentAlphaBlendEnable),
2734          INIT(blend, ColorDitherEnable),
2735       };
2736       GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);
2737 
2738       /* Jump to blend entries. */
2739       dws += GENX(BLEND_STATE_length);
2740       for (uint32_t i = 0; i < MAX_RTS; i++) {
2741          struct GENX(BLEND_STATE_ENTRY) entry = {
2742             INIT(blend.rts[i], WriteDisableAlpha),
2743             INIT(blend.rts[i], WriteDisableRed),
2744             INIT(blend.rts[i], WriteDisableGreen),
2745             INIT(blend.rts[i], WriteDisableBlue),
2746             INIT(blend.rts[i], LogicOpFunction),
2747             INIT(blend.rts[i], LogicOpEnable),
2748             INIT(blend.rts[i], ColorBufferBlendEnable),
2749             INIT(blend.rts[i], ColorClampRange),
2750             INIT(blend.rts[i], PreBlendColorClampEnable),
2751             INIT(blend.rts[i], PostBlendColorClampEnable),
2752             INIT(blend.rts[i], SourceBlendFactor),
2753             INIT(blend.rts[i], DestinationBlendFactor),
2754             INIT(blend.rts[i], ColorBlendFunction),
2755             INIT(blend.rts[i], SourceAlphaBlendFactor),
2756             INIT(blend.rts[i], DestinationAlphaBlendFactor),
2757             INIT(blend.rts[i], AlphaBlendFunction),
2758          };
2759 
2760          GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
2761          dws += GENX(BLEND_STATE_ENTRY_length);
2762       }
2763 
2764       /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below */
2765       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
2766    }
2767 
2768    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR)) {
2769       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
2770          bsp.BlendStatePointer      = hw_state->blend.state.offset;
2771          bsp.BlendStatePointerValid = true;
2772       }
2773    }
2774 
2775 #if INTEL_WA_18019816803_GFX_VER
2776    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
2777       genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2778                                    cmd_buffer->state.current_pipeline,
2779                                    ANV_PIPE_PSS_STALL_SYNC_BIT);
2780    }
2781 #endif
2782 
2783 #if INTEL_WA_14018283232_GFX_VER
2784    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_14018283232))
2785       genX(batch_emit_wa_14018283232)(&cmd_buffer->batch);
2786 #endif
2787 
2788 #if GFX_VER == 9
2789    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
2790       genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
2791 #endif
2792 
2793 #if GFX_VERx10 >= 125
2794    if (hw_state->use_tbimr &&
2795        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
2796       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
2797                      tbimr) {
2798          SET(tbimr, tbimr, TileRectangleHeight);
2799          SET(tbimr, tbimr, TileRectangleWidth);
2800          SET(tbimr, tbimr, VerticalTileCount);
2801          SET(tbimr, tbimr, HorizontalTileCount);
2802          SET(tbimr, tbimr, TBIMRBatchSize);
2803          SET(tbimr, tbimr, TileBoxCheck);
2804       }
2805    }
2806 #endif
2807 
2808 #undef INIT
2809 #undef SET
2810 
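   /* Every dirty bit has been consumed above; reset the set for the next
    * flush.
    */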
2811    BITSET_ZERO(hw_state->dirty);
2812 }
2813 
2814 /**
2815  * This function handles possible state workarounds and emits the dirty
2816  * instructions to the batch buffer.
2817  */
2818 void
2819 genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
2820 {
2821    struct anv_device *device = cmd_buffer->device;
2822    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2823    struct anv_graphics_pipeline *pipeline =
2824       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2825    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
2826 
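   /* With INTEL_DEBUG=reemit, OR every graphics state dirty bit back in so
    * the complete state is reprogrammed on each flush (handy for tracking
    * down missing dirty-bit propagation).
    */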
2827    if (INTEL_DEBUG(DEBUG_REEMIT)) {
2828       BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
2829                 device->gfx_dirty_state);
2830    }
2831 
2832    /**
2833     * Put potential workarounds here if you need to reemit an instruction
2834     * because another one is changing.
2835     */
2836 
2837    /* Wa_16012775297 - Emit dummy VF statistics before each 3DSTATE_VF. */
2838 #if INTEL_WA_16012775297_GFX_VER
2839    if (intel_needs_workaround(device->info, 16012775297) &&
2840        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF))
2841       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2842 #endif
2843 
2844    /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
2845     * it afterwards.
2846     */
2847    if (intel_needs_workaround(device->info, 16011773973) &&
2848        pipeline->uses_xfb &&
2849        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
2850       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2851    }
2852 
2853 #if INTEL_WA_18038825448_GFX_VER
2854    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2855    if (wm_prog_data) {
2856       genX(cmd_buffer_set_coarse_pixel_active)(
2857          cmd_buffer,
2858          brw_wm_prog_data_is_coarse(wm_prog_data, hw_state->fs_msaa_flags));
2859    }
2860 #endif
2861 
2862    /* Gfx11 undocumented issue:
2863     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
2864     */
2865 #if GFX_VER == 11
2866    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE))
2867       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
2868 #endif
2869 
2870    /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
2871    if (intel_needs_workaround(device->info, 18020335297) &&
2872        (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
2873         BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) &&
2874        cmd_buffer->state.gfx.viewport_set) {
2875       /* For mesh, we implement the WA using a CS stall. This is for
2876        * simplicity and takes care of a possible interaction with Wa_16014390852.
2877        */
2878       if (anv_pipeline_is_mesh(pipeline)) {
2879          genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2880                                       _3D, ANV_PIPE_CS_STALL_BIT);
2881       } else {
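         /* Primitive pipeline path: emit everything that is dirty except
          * the instructions this workaround reprograms, run a dummy draw to
          * latch the new viewport state, then re-dirty those instructions
          * so the real state is emitted again at the end of this function.
          */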
2882          /* Mask off all instructions that we program. */
2883          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
2884          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
2885          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2886          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
2887          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2888          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2889          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2890          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
2891          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2892          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2893          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2894 
2895          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
2896          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
2897          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
2898          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
2899          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
2900 
2901          cmd_buffer_gfx_state_emission(cmd_buffer);
2902 
2903          emit_wa_18020335297_dummy_draw(cmd_buffer);
2904 
2905          /* Re-dirty all the state the WA emitted to make sure that the
2906           * current real state is restored.
2907           */
2908          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
2909          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
2910          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2911          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
2912          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2913          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2914          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2915          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
2916          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2917          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2918          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2919 
2920          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
2921          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
2922          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
2923          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
2924          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
2925       }
2926    }
2927 
2928    cmd_buffer_gfx_state_emission(cmd_buffer);
2929 }
2930 
2931 void
2932 genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
2933 {
2934    if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
2935       return;
2936 
2937    if (cmd_buffer->state.gfx.pma_fix_enabled == enable)
2938       return;
2939 
2940    cmd_buffer->state.gfx.pma_fix_enabled = enable;
2941 
2942    /* According to the Broadwell PIPE_CONTROL documentation, software should
2943     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2944     * prior to the LRI.  If stencil buffer writes are enabled, then a Render
2945     * Cache Flush is also necessary.
2946     *
2947     * The Skylake docs say to use a depth stall rather than a command
2948     * streamer stall.  However, the hardware seems to violently disagree.
2949     * A full command streamer stall seems to be needed in both cases.
2950     */
2951    genx_batch_emit_pipe_control
2952       (&cmd_buffer->batch, cmd_buffer->device->info,
2953        cmd_buffer->state.current_pipeline,
2954        ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2955        ANV_PIPE_CS_STALL_BIT |
2956 #if GFX_VER >= 12
2957        ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2958 #endif
2959        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2960 
2961 #if GFX_VER == 9
2962    uint32_t cache_mode;
2963    anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
2964                    .STCPMAOptimizationEnable = enable,
2965                    .STCPMAOptimizationEnableMask = true);
2966    anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2967       lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
2968       lri.DataDWord        = cache_mode;
2969    }
2970 
2971 #endif /* GFX_VER == 9 */
2972 
2973    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2974     * Flush bits is often necessary.  We do it regardless because it's easier.
2975     * The render cache flush is also necessary if stencil writes are enabled.
2976     *
2977     * Again, the Skylake docs give a different set of flushes but the BDW
2978     * flushes seem to work just as well.
2979     */
2980    genx_batch_emit_pipe_control
2981       (&cmd_buffer->batch, cmd_buffer->device->info,
2982        cmd_buffer->state.current_pipeline,
2983        ANV_PIPE_DEPTH_STALL_BIT |
2984        ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2985 #if GFX_VER >= 12
2986        ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2987 #endif
2988        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2989 }
2990