/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "common/intel_guardband.h"
#include "common/intel_tiled_render.h"
#include "compiler/brw_prim.h"

const uint32_t genX(vk_to_intel_blend)[] = {
   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
};

static const uint32_t genX(vk_to_intel_blend_op)[] = {
   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
};

static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
{
#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout and enable it
    * again if XFB is not used by the current pipeline.
    *
    * Although this workaround applies to Gfx12+, we already disable object
    * level preemption for another reason in genX_state.c so we can skip this
    * for Gfx12.
    */
   if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (pipeline->uses_xfb) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
      return;
   }

   if (!cmd_buffer->state.gfx.object_preemption)
      genX(cmd_buffer_set_preemption)(cmd_buffer, true);
#endif
}

#if GFX_VER >= 12
static uint32_t
get_cps_state_offset(struct anv_device *device, bool cps_enabled,
                     const struct vk_fragment_shading_rate_state *fsr)
{
   if (!cps_enabled)
      return device->cps_states.offset;

   uint32_t offset;
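   /* Fragment sizes of 1, 2 and 4 pixels per dimension map to indices 0, 1
    * and 2; the precomputed CPS states are laid out on that grid.
    */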
   static const uint32_t size_index[] = {
      [1] = 0,
      [2] = 1,
      [4] = 2,
   };

#if GFX_VERx10 >= 125
   offset =
      1 + /* skip disabled */
      fsr->combiner_ops[0] * 5 * 3 * 3 +
      fsr->combiner_ops[1] * 3 * 3 +
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#else
   offset =
      1 + /* skip disabled */
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#endif

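   /* Each entry covers one CPS_STATE structure per viewport;
    * GENX(CPS_STATE_length) is in dwords, hence the extra factor of 4 to
    * get a byte offset.
    */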
   offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;

   return device->cps_states.offset + offset;
}
#endif /* GFX_VER >= 12 */

static bool
has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn)
{
   return dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                 VK_IMAGE_ASPECT_STENCIL_BIT);
}

UNUSED static bool
want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer,
                     const struct vk_dynamic_graphics_state *dyn,
                     const struct vk_depth_stencil_state *ds)
{
   if (GFX_VER > 9)
      return false;
   assert(GFX_VER == 9);

   /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
    *
    *    Clearing this bit will force the STC cache to wait for pending
    *    retirement of pixels at the HZ-read stage and do the STC-test for
    *    Non-promoted, R-computed and Computed depth modes instead of
    *    postponing the STC-test to RCPFE.
    *
    *    STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                  3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    *
    *    STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    *                   (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *                    3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    *
    *    COMP_STC_EN = STC_TEST_EN &&
    *                  3DSTATE_PS_EXTRA::PixelShaderComputesStencil
    *
    *    SW parses the pipeline states to generate the following logical
    *    signal indicating if PMA FIX can be enabled.
    *
    *    STC_PMA_OPT =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       !(3DSTATE_WM::EDSC_Mode == 2) &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *       (COMP_STC_EN || STC_WRITE_EN) &&
    *       ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *         3DSTATE_WM::ForceKillPix == ON ||
    *         3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *         3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *         3DSTATE_PS_BLEND::AlphaTestEnable ||
    *         3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *        (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
    */

   /* These are always true:
    *    3DSTATE_WM::ForceThreadDispatch != 1 &&
    *    !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
    */

   /* We only enable the PMA fix if we know for certain that HiZ is enabled.
    * If we don't know whether HiZ is enabled or not, we disable the PMA fix
    * and there is no harm.
    *
    * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable
    */
   if (!cmd_buffer->state.hiz_enabled)
      return false;

   /* We can't possibly know if HiZ is enabled without the depth attachment */
   ASSERTED const struct anv_image_view *d_iview =
      cmd_buffer->state.gfx.depth_att.iview;
   assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);

   /* 3DSTATE_PS_EXTRA::PixelShaderValid */
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
      return false;

   /* !(3DSTATE_WM::EDSC_Mode == 2) */
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* We never use anv_pipeline for HiZ ops so this is trivially true:
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    */

   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
    */
   const bool stc_test_en = ds->stencil.test_enable;

   /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
    * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *  3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
    */
   const bool stc_write_en = ds->stencil.write_enable;

   /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
   const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;

   /* COMP_STC_EN || STC_WRITE_EN */
   if (!(comp_stc_en || stc_write_en))
      return false;

   /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_WM::ForceKillPix == ON ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
    */
   return pipeline->kill_pixel ||
          pipeline->rp_has_ds_self_dep ||
          has_ds_feedback_loop(dyn) ||
          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
}

static void
genX(rasterization_mode)(VkPolygonMode raster_mode,
                         VkLineRasterizationModeKHR line_mode,
                         float line_width,
                         uint32_t *api_mode,
                         bool *msaa_rasterization_enable)
{
   if (raster_mode == VK_POLYGON_MODE_LINE) {
      /* Unfortunately, configuring our line rasterization hardware on gfx8
       * and later is rather painful.  Instead of giving us bits to tell the
       * hardware what line mode to use like we had on gfx7, we now have an
       * arcane combination of API Mode and MSAA enable bits which do things
       * in a table which are expected to magically put the hardware into the
       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
       * hardware people thought of so nothing works the way you want it to.
       *
       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
       * of the Skylake PRM for more details.
       */
      switch (line_mode) {
      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
         *api_mode = DX101;
#if GFX_VER <= 9
         /* Prior to ICL, the algorithm the HW uses to draw wide lines
          * doesn't quite match what the CTS expects, at least for rectangular
          * lines, so we set this to false here, making it draw parallelograms
          * instead, which work well enough.
          */
         *msaa_rasterization_enable = line_width < 1.0078125;
#else
         *msaa_rasterization_enable = true;
#endif
         break;

      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
         *api_mode = DX9OGL;
         *msaa_rasterization_enable = false;
         break;

      default:
         unreachable("Unsupported line rasterization mode");
      }
   } else {
      *api_mode = DX101;
      *msaa_rasterization_enable = true;
   }
}

#if GFX_VERx10 == 125
/**
 * Return the dimensions of the current rendering area, defined as the
 * bounding box of all present color, depth and stencil attachments.
 */
UNUSED static bool
calculate_render_area(struct anv_cmd_buffer *cmd_buffer,
                      unsigned *width, unsigned *height)
{
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;

   *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
   *height = gfx->render_area.offset.y + gfx->render_area.extent.height;

   for (unsigned i = 0; i < gfx->color_att_count; i++) {
      struct anv_attachment *att = &gfx->color_att[i];
      if (att->iview) {
         *width = MAX2(*width, att->iview->vk.extent.width);
         *height = MAX2(*height, att->iview->vk.extent.height);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      *width = MAX2(*width, z_view->vk.extent.width);
      *height = MAX2(*height, z_view->vk.extent.height);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view) {
      *width = MAX2(*width, s_view->vk.extent.width);
      *height = MAX2(*height, s_view->vk.extent.height);
   }

   return *width && *height;
}

/* Calculate TBIMR tiling parameters adequate for the current pipeline
 * setup.  Return true if TBIMR should be enabled.
 */
UNUSED static bool
calculate_tile_dimensions(struct anv_cmd_buffer *cmd_buffer,
                          unsigned fb_width, unsigned fb_height,
                          unsigned *tile_width, unsigned *tile_height)
{
   const struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
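   /* Estimated main-to-auxiliary surface size ratio used to approximate the
    * CCS footprint below (roughly 1 byte of CCS data per 256 bytes of main
    * surface).
    */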
   const unsigned aux_scale = 256;
   unsigned pixel_size = 0;

   /* Perform a rough calculation of the tile cache footprint of the
    * pixel pipeline, approximating it as the sum of the amount of
    * memory used per pixel by every render target, depth, stencil and
    * auxiliary surface bound to the pipeline.
    */
   for (uint32_t i = 0; i < gfx->color_att_count; i++) {
      struct anv_attachment *att = &gfx->color_att[i];

      if (att->iview) {
         const struct anv_image *image = att->iview->image;
         const unsigned p = anv_image_aspect_to_plane(image,
                                                      VK_IMAGE_ASPECT_COLOR_BIT);
         const struct anv_image_plane *plane = &image->planes[p];

         pixel_size += intel_calculate_surface_pixel_size(
            &plane->primary_surface.isl);

         if (isl_aux_usage_has_mcs(att->aux_usage))
            pixel_size += intel_calculate_surface_pixel_size(
               &plane->aux_surface.isl);

         /* XXX - Use proper implicit CCS surface metadata tracking
          *       instead of inferring pixel size from primary
          *       surface.
          */
         if (isl_aux_usage_has_ccs(att->aux_usage))
            pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                          &plane->primary_surface.isl),
                                       aux_scale);
      }
   }

   const struct anv_image_view *const z_view = gfx->depth_att.iview;
   if (z_view) {
      const struct anv_image *image = z_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_DEPTH_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);

      if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
         pixel_size += intel_calculate_surface_pixel_size(
            &plane->aux_surface.isl);

      /* XXX - Use proper implicit CCS surface metadata tracking
       *       instead of inferring pixel size from primary
       *       surface.
       */
      if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
         pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
                                       &plane->primary_surface.isl),
                                    aux_scale);
   }

   const struct anv_image_view *const s_view = gfx->stencil_att.iview;
   if (s_view && s_view != z_view) {
      const struct anv_image *image = s_view->image;
      assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
      const unsigned p = anv_image_aspect_to_plane(image,
                                                   VK_IMAGE_ASPECT_STENCIL_BIT);
      const struct anv_image_plane *plane = &image->planes[p];

      pixel_size += intel_calculate_surface_pixel_size(
         &plane->primary_surface.isl);
   }

   if (!pixel_size)
      return false;

   /* Compute a tile layout that allows reasonable utilization of the
    * tile cache based on the per-pixel cache footprint estimated
    * above.
    */
   intel_calculate_tile_dimensions(device->info, cmd_buffer->state.current_l3_config,
                                   32, 32, fb_width, fb_height,
                                   pixel_size, tile_width, tile_height);

   /* Perform TBIMR tile passes only if the framebuffer covers more
    * than a single tile.
    */
   return *tile_width < fb_width || *tile_height < fb_height;
}
#endif

/**
 * This function takes the Vulkan runtime values & dirty states and updates
 * the values in anv_gfx_dynamic_state, flagging HW instructions for
 * reemission if the values are changing.
 *
 * Nothing is emitted in the batch buffer.
 */
void
genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
{
   UNUSED struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
   const struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(gfx->base.pipeline);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
   struct anv_instance *instance = cmd_buffer->device->physical->instance;

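/* Helpers tracking the values to be programmed into HW instructions: SET()
 * updates a field of anv_gfx_dynamic_state and flags the corresponding
 * instruction for re-emission only when the value actually changes;
 * SET_STAGE() additionally skips the dirty flagging when the given shader
 * stage is not present in the pipeline.
 */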
#define GET(field) hw_state->field
#define SET(bit, field, value)                               \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit);   \
      }                                                      \
   } while (0)
#define SET_STAGE(bit, field, value, stage)                  \
   do {                                                      \
      __typeof(hw_state->field) __v = value;                 \
      if (!anv_pipeline_has_stage(pipeline,                  \
                                  MESA_SHADER_##stage)) {    \
         hw_state->field = __v;                              \
         break;                                              \
      }                                                      \
      if (hw_state->field != __v) {                          \
         hw_state->field = __v;                              \
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit);   \
      }                                                      \
   } while (0)

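/* Program the per-primitive-type provoking vertex selects of either
 * 3DSTATE_SF or 3DSTATE_CLIP from the Vulkan provoking vertex mode.
 */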
#define SETUP_PROVOKING_VERTEX(bit, cmd, mode)                         \
   switch (mode) {                                                     \
   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:                     \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     0);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       1);         \
      break;                                                           \
   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:                      \
      SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2);         \
      SET(bit, cmd.LineStripListProvokingVertexSelect,     1);         \
      SET(bit, cmd.TriangleFanProvokingVertexSelect,       2);         \
      break;                                                           \
   default:                                                            \
      unreachable("Invalid provoking vertex mode");                    \
   }                                                                   \

   if ((gfx->dirty & (ANV_CMD_DIRTY_PIPELINE |
                      ANV_CMD_DIRTY_XFB_ENABLE |
                      ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
      SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
      SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);

#if INTEL_NEEDS_WA_18022508906
      /* Wa_18022508906 :
       *
       * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
       *
       * SOL_INT::Render_Enable =
       *   (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
       *   (
       *     (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
       *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
       *     !3DSTATE_STREAMOUT::API_Render_Disable &&
       *     (
       *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
       *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
       *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
       *       3DSTATE_PS_EXTRA::PS_Valid ||
       *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
       *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
       *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
       *     )
       *   )
       *
       * If SOL_INT::Render_Enable is false, the SO stage will not forward
       * any topologies down the pipeline, which is not what we want for
       * occlusion queries.
       *
       * Here we force rendering to get SOL_INT::Render_Enable when occlusion
       * queries are active.
       */
      SET(STREAMOUT, so.ForceRendering,
          (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ?
          Force_on : 0);
#endif

      switch (dyn->rs.provoking_vertex) {
      case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
         SET(STREAMOUT, so.ReorderMode, LEADING);
         SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY);
         break;

      case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
         SET(STREAMOUT, so.ReorderMode, TRAILING);
         SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY);
         break;

      default:
         unreachable("Invalid provoking vertex mode");
      }
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
      uint32_t topology;
      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
         topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
      else
         topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];

      gfx->primitive_topology = topology;

      SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);

#if GFX_VER >= 11
   if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate &&
       (gfx->dirty & ANV_CMD_DIRTY_PIPELINE ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) {
      const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
      const bool cps_enable = wm_prog_data &&
         brw_wm_prog_data_is_coarse(wm_prog_data, pipeline->fs_msaa_flags);
#if GFX_VER == 11
      SET(CPS, cps.CoarsePixelShadingMode,
               cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE);
      SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
      SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
#elif GFX_VER >= 12
      SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
               get_cps_state_offset(device, cps_enable, &dyn->fsr));
#endif
   }
#endif /* GFX_VER >= 11 */

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
      const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

      if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
            SET(TE, te.OutputTopology, tes_prog_data->output_topology);
         } else {
            /* When the origin is upper-left, we have to flip the winding order */
            if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
               SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
            } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
               SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
            } else {
               SET(TE, te.OutputTopology, tes_prog_data->output_topology);
            }
         }
      } else {
         SET(TE, te.OutputTopology, OUTPUT_POINT);
      }
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
      SET(SF, sf.LineWidth, dyn->rs.line.width);

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
      SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
      SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
      /**
       * From the Vulkan Spec:
       *
       *    "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth
       *     bias representation is a factor of constant r equal to 1."
       *
       * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
       *
       *    "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
       *
       *     Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
       *
       *     Where r is the minimum representable value > 0 in the depth
       *     buffer format, converted to float32 (note: If state bit Legacy
       *     Global Depth Bias Enable is set, the r term will be forced to
       *     1.0)"
       *
       * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
       * LegacyGlobalDepthBiasEnable.
       */
      SET(SF, sf.LegacyGlobalDepthBiasEnable,
          dyn->rs.depth_bias.representation ==
          VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
      SET(CLIP, clip.APIMode, dyn->vp.depth_clip_negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) {
      /* Take dynamic primitive topology into account with
       *    3DSTATE_RASTER::APIMode
       *    3DSTATE_RASTER::DXMultisampleRasterizationEnable
       *    3DSTATE_RASTER::AntialiasingEnable
       */
      uint32_t api_mode = 0;
      bool msaa_raster_enable = false;

      const VkLineRasterizationModeKHR line_mode =
         anv_line_rasterization_mode(dyn->rs.line.mode,
                                     pipeline->rasterization_samples);

      const VkPolygonMode dynamic_raster_mode =
         genX(raster_polygon_mode)(pipeline,
                                   dyn->rs.polygon_mode,
                                   dyn->ia.primitive_topology);

      genX(rasterization_mode)(dynamic_raster_mode,
                               line_mode, dyn->rs.line.width,
                               &api_mode, &msaa_raster_enable);

      /* From the Broadwell PRM, Volume 2, documentation for
       * 3DSTATE_RASTER, "Antialiasing Enable":
       *
       *   "This field must be disabled if any of the render targets
       *    have integer (UINT or SINT) surface format."
       *
       * Additionally internal documentation for Gfx12+ states:
       *
       *   "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
       *    FORCED_SAMPLE_COUNT > 1."
       */
      const bool aa_enable =
         anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
         !gfx->has_uint_rt &&
         !(GFX_VER >= 12 && gfx->samples > 1);

      const bool depth_clip_enable =
         vk_rasterization_state_depth_clip_enable(&dyn->rs);

      const bool xy_clip_test_enable =
         (dynamic_raster_mode == VK_POLYGON_MODE_FILL);

      SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);

      SET(RASTER, raster.APIMode, api_mode);
      SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
      SET(RASTER, raster.AntialiasingEnable, aa_enable);
      SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]);
      SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]);
      SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
      SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
      SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
      SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant);
      SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope);
      SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
      SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
      SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
      SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
      SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
      SET(RASTER, raster.ConservativeRasterizationEnable,
                  dyn->rs.conservative_mode !=
                  VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
      /* From the Vulkan 1.0 spec:
       *    If pSampleMask is NULL, it is treated as if the mask has all bits
       *    enabled, i.e. no coverage is removed from fragments.
       *
       * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
       */
      SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
#if GFX_VER == 9
       /* For the PMA fix */
       (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
#endif
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
      VkImageAspectFlags ds_aspects = 0;
      if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
         ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
      if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
         ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;

      struct vk_depth_stencil_state opt_ds = dyn->ds;
      vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);

      SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);

      SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
                            opt_ds.stencil.front.compare_mask & 0xff);
      SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
                            opt_ds.stencil.front.write_mask & 0xff);

      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);

      SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
                            opt_ds.stencil.front.reference & 0xff);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
                            opt_ds.stencil.back.reference & 0xff);

      SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
      SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
      SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
                            genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]);
      SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
      SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable);
      SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]);
      SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]);
      SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]);
      SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
                            genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
                            genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]);
      SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
                            genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]);

#if GFX_VER == 9
      const bool pma = want_stencil_pma_fix(cmd_buffer, dyn, &opt_ds);
      SET(PMA_FIX, pma_fix, pma);
#endif

#if GFX_VERx10 >= 125
      if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
         bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
         if (gfx->ds_write_state != ds_write_state) {
            gfx->ds_write_state = ds_write_state;
            BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803);
         }
      }
#endif
   }

#if GFX_VER >= 12
   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
      SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);
      /* Only look at updating the bounds if testing is enabled */
      if (dyn->ds.depth.bounds_test.enable) {
         SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
         SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
      }
   }
#endif

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) {
      SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
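      /* The HW wants both the repeat count and its reciprocal; clamp the
       * factor to at least 1 so we never divide by zero.
       */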
      SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
                        1.0f / MAX2(1, dyn->rs.line.stipple.factor));
      SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);

      SET(WM,           wm.LineStippleEnable, dyn->rs.line.stipple.enable);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
      SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
      SET(VF, vf.CutIndex, gfx->restart_index);
   }

   if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER)
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER);

#if GFX_VERx10 >= 125
   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
      SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable);
#endif

   if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
       (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN);

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
      /* 3DSTATE_WM in the hope we can avoid spawning fragment shader
       * threads.
       */
      bool force_thread_dispatch =
         anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
         (pipeline->force_fragment_thread_dispatch ||
          anv_cmd_buffer_all_color_write_masked(cmd_buffer));
      SET(WM, wm.ForceThreadDispatchEnable, force_thread_dispatch ? ForceON : 0);
   }

   if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
      const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

      SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
                wm_prog_data && (pipeline->rp_has_ds_self_dep ||
                                 has_ds_feedback_loop(dyn) ||
                                 wm_prog_data->uses_kill),
                FRAGMENT);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
       (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
      const uint8_t color_writes = dyn->cb.color_write_enables;
      const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
      bool has_writeable_rt =
         anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
         (color_writes & ((1u << gfx->color_att_count) - 1)) != 0;

      SET(BLEND_STATE, blend.AlphaToCoverageEnable,
                       dyn->ms.alpha_to_coverage_enable);
      SET(BLEND_STATE, blend.AlphaToOneEnable,
                       dyn->ms.alpha_to_one_enable);

      bool independent_alpha_blend = false;
      /* Wa_14018912822, check if we set these during RT setup. */
      bool color_blend_zero = false;
      bool alpha_blend_zero = false;
      for (uint32_t i = 0; i < MAX_RTS; i++) {
         /* Disable anything above the current number of color attachments. */
         bool write_disabled = i >= gfx->color_att_count ||
                               (color_writes & BITFIELD_BIT(i)) == 0;

         SET(BLEND_STATE, blend.rts[i].WriteDisableAlpha,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_A_BIT) == 0);
         SET(BLEND_STATE, blend.rts[i].WriteDisableRed,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_R_BIT) == 0);
         SET(BLEND_STATE, blend.rts[i].WriteDisableGreen,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_G_BIT) == 0);
         SET(BLEND_STATE, blend.rts[i].WriteDisableBlue,
                          write_disabled ||
                          (dyn->cb.attachments[i].write_mask &
                           VK_COLOR_COMPONENT_B_BIT) == 0);
         /* Vulkan specification 1.2.168, VkLogicOp:
          *
          *   "Logical operations are controlled by the logicOpEnable and
          *   logicOp members of VkPipelineColorBlendStateCreateInfo. If
          *   logicOpEnable is VK_TRUE, then a logical operation selected by
          *   logicOp is applied between each color attachment and the
          *   fragment’s corresponding output value, and blending of all
          *   attachments is treated as if it were disabled."
          *
          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
          * BLEND_STATE_ENTRY:
          *
          *   "Enabling LogicOp and Color Buffer Blending at the same time is
          *   UNDEFINED"
          */
         SET(BLEND_STATE, blend.rts[i].LogicOpFunction,
                          genX(vk_to_intel_logic_op)[dyn->cb.logic_op]);
         SET(BLEND_STATE, blend.rts[i].LogicOpEnable, dyn->cb.logic_op_enable);

         SET(BLEND_STATE, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT);
         SET(BLEND_STATE, blend.rts[i].PreBlendColorClampEnable, true);
         SET(BLEND_STATE, blend.rts[i].PostBlendColorClampEnable, true);

         /* Setup blend equation. */
         SET(BLEND_STATE, blend.rts[i].ColorBlendFunction,
                          genX(vk_to_intel_blend_op)[
                             dyn->cb.attachments[i].color_blend_op]);
         SET(BLEND_STATE, blend.rts[i].AlphaBlendFunction,
                          genX(vk_to_intel_blend_op)[
                             dyn->cb.attachments[i].alpha_blend_op]);

         if (dyn->cb.attachments[i].src_color_blend_factor !=
             dyn->cb.attachments[i].src_alpha_blend_factor ||
             dyn->cb.attachments[i].dst_color_blend_factor !=
             dyn->cb.attachments[i].dst_alpha_blend_factor ||
             dyn->cb.attachments[i].color_blend_op !=
             dyn->cb.attachments[i].alpha_blend_op) {
            independent_alpha_blend = true;
         }

         /* The Dual Source Blending documentation says:
          *
          * "If SRC1 is included in a src/dst blend factor and
          * a DualSource RT Write message is not used, results
          * are UNDEFINED. (This reflects the same restriction in DX APIs,
          * where undefined results are produced if “o1” is not written
          * by a PS – there are no default values defined)."
          *
          * There is no way to gracefully fix this undefined situation
          * so we just disable the blending to prevent possible issues.
          */
         if (wm_prog_data && !wm_prog_data->dual_src_blend &&
             anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) {
            SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable, false);
         } else {
            SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable,
                             !dyn->cb.logic_op_enable &&
                             dyn->cb.attachments[i].blend_enable);
         }

         /* Our hardware applies the blend factor prior to the blend function
          * regardless of what function is used.  Technically, this means the
          * hardware can do MORE than GL or Vulkan specify.  However, it also
          * means that, for MIN and MAX, we have to stomp the blend factor to
          * ONE to make it a no-op.
          */
         uint32_t SourceBlendFactor;
         uint32_t DestinationBlendFactor;
         uint32_t SourceAlphaBlendFactor;
         uint32_t DestinationAlphaBlendFactor;
         if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN ||
             dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) {
            SourceBlendFactor = BLENDFACTOR_ONE;
            DestinationBlendFactor = BLENDFACTOR_ONE;
         } else {
            SourceBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].src_color_blend_factor];
            DestinationBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].dst_color_blend_factor];
         }

         if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN ||
             dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) {
            SourceAlphaBlendFactor = BLENDFACTOR_ONE;
            DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
         } else {
            SourceAlphaBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].src_alpha_blend_factor];
            DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[
               dyn->cb.attachments[i].dst_alpha_blend_factor];
         }

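         /* Wa_14018912822: a destination blend factor of ZERO is replaced
          * with a constant factor and flagged here so that the blend
          * constant color gets programmed to 0.0f below (see the CC_STATE
          * update), which is equivalent.
          */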
         if (instance->intel_enable_wa_14018912822 &&
             intel_needs_workaround(cmd_buffer->device->info, 14018912822) &&
             pipeline->rasterization_samples > 1) {
            if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
               DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
               color_blend_zero = true;
            }
            if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
               DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
               alpha_blend_zero = true;
            }
         }

         SET(BLEND_STATE, blend.rts[i].SourceBlendFactor, SourceBlendFactor);
         SET(BLEND_STATE, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor);
         SET(BLEND_STATE, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
         SET(BLEND_STATE, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
      }
      gfx->color_blend_zero = color_blend_zero;
      gfx->alpha_blend_zero = alpha_blend_zero;

      SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);

      /* 3DSTATE_PS_BLEND to be consistent with the rest of the
       * BLEND_STATE_ENTRY.
       */
      SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
      SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable));
      SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor));
      SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, gfx->alpha_blend_zero ?
                                                          BLENDFACTOR_CONST_COLOR :
                                                          GET(blend.rts[0].DestinationAlphaBlendFactor));
      SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor));
      SET(PS_BLEND, ps_blend.DestinationBlendFactor, gfx->color_blend_zero ?
                                                     BLENDFACTOR_CONST_COLOR :
                                                     GET(blend.rts[0].DestinationBlendFactor));
      SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
      SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable));
      SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable);
   }

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
      SET(CC_STATE, cc.BlendConstantColorRed,
                    gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
      SET(CC_STATE, cc.BlendConstantColorGreen,
                    gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
      SET(CC_STATE, cc.BlendConstantColorBlue,
                    gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
      SET(CC_STATE, cc.BlendConstantColorAlpha,
                    gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
   }

   if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
      struct anv_instance *instance = cmd_buffer->device->physical->instance;
      const VkViewport *viewports = dyn->vp.viewports;

      const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;

      for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
         const VkViewport *vp = &viewports[i];

         /* The gfx7 state struct has just the matrix and guardband fields, the
          * gfx8 struct adds the min/max viewport fields. */
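         /* Standard viewport transform: m00/m11/m22 scale and m30/m31/m32
          * translate NDC into window coordinates. The 0.5 depth scale
          * accounts for the [-1, 1] clip-space depth range when
          * depth_clip_negative_one_to_one is enabled.
          */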
         struct GENX(SF_CLIP_VIEWPORT) sfv = {
            .ViewportMatrixElementm00 = vp->width / 2,
            .ViewportMatrixElementm11 = vp->height / 2,
            .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
            .ViewportMatrixElementm30 = vp->x + vp->width / 2,
            .ViewportMatrixElementm31 = vp->y + vp->height / 2,
            .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
               (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
            .XMinClipGuardband = -1.0f,
            .XMaxClipGuardband = 1.0f,
            .YMinClipGuardband = -1.0f,
            .YMaxClipGuardband = 1.0f,
            .XMinViewPort = vp->x,
            .XMaxViewPort = vp->x + vp->width - 1,
            .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
            .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
         };

         /* Fix depth test misrenderings by lowering translated depth range */
         if (instance->lower_depth_range_rate != 1.0f)
            sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;

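         /* The render area can be at most 16K x 16K, the maximum
          * framebuffer dimension supported by the HW.
          */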
         const uint32_t fb_size_max = 1 << 14;
         uint32_t x_min = 0, x_max = fb_size_max;
         uint32_t y_min = 0, y_max = fb_size_max;

         /* If we have a valid renderArea, include that */
         if (gfx->render_area.extent.width > 0 &&
             gfx->render_area.extent.height > 0) {
            x_min = MAX2(x_min, gfx->render_area.offset.x);
            x_max = MIN2(x_max, gfx->render_area.offset.x +
                                gfx->render_area.extent.width);
            y_min = MAX2(y_min, gfx->render_area.offset.y);
            y_max = MIN2(y_max, gfx->render_area.offset.y +
                                gfx->render_area.extent.height);
         }

         /* The client is required to have enough scissors for whatever it
          * sets as ViewportIndex but it's possible that they've got more
          * viewports set from a previous command. Also, from the Vulkan
          * 1.3.207 spec:
          *
          *    "The application must ensure (using scissor if necessary) that
          *    all rendering is contained within the render area."
          *
          * If the client doesn't set a scissor, that basically means it
          * guarantees everything is in-bounds already. If we end up using a
          * guardband of [-1, 1] in that case, there shouldn't be much loss.
          * It's theoretically possible that they could do all their clipping
          * with clip planes but that'd be a bit odd.
          */
1140          if (i < dyn->vp.scissor_count) {
1141             const VkRect2D *scissor = &dyn->vp.scissors[i];
1142             x_min = MAX2(x_min, scissor->offset.x);
1143             x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
1144             y_min = MAX2(y_min, scissor->offset.y);
1145             y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
1146          }
1147 
1148          /* Only bother calculating the guardband if our known render area is
1149           * less than the maximum size. Otherwise, it will calculate [-1, 1]
1150           * anyway but possibly with precision loss.
1151           */
1152          if (x_min > 0 || x_max < fb_size_max ||
1153              y_min > 0 || y_max < fb_size_max) {
1154             intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
1155                                            sfv.ViewportMatrixElementm00,
1156                                            sfv.ViewportMatrixElementm11,
1157                                            sfv.ViewportMatrixElementm30,
1158                                            sfv.ViewportMatrixElementm31,
1159                                            &sfv.XMinClipGuardband,
1160                                            &sfv.XMaxClipGuardband,
1161                                            &sfv.YMinClipGuardband,
1162                                            &sfv.YMaxClipGuardband);
1163          }
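         /* Rough intuition: primitives that fall entirely inside the
          * guardband can take the trivial-accept path and be scissored
          * instead of going through full 3D clipping, so computing a
          * guardband from the known render area lets more primitives skip
          * the clipper. The [-1, 1] defaults above make the guardband
          * coincide with the viewport.
          */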
1164 
1165 #define SET_VP(bit, state, field)                                        \
1166          do {                                                           \
1167             if (hw_state->state.field != sfv.field) {                   \
1168                hw_state->state.field = sfv.field;                       \
1169                BITSET_SET(hw_state->dirty,                              \
1170                           ANV_GFX_STATE_##bit);                         \
1171             }                                                           \
1172          } while (0)
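         /* SET_VP only marks the SF_CLIP viewport state dirty when a field
          * actually changed, so an unchanged viewport does not force a
          * reemission of the whole SF_CLIP_VIEWPORT array.
          */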
1173          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
1174          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
1175          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
1176          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
1177          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
1178          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
1179          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
1180          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
1181          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
1182          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
1183          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
1184          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
1185          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
1186          SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
1187 #undef SET_VP
1188 
1189          const bool depth_range_unrestricted =
1190             cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted;
1191 
1192          float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
1193          float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;
1194 
1195          float min_depth = dyn->rs.depth_clamp_enable ?
1196                            MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
1197          float max_depth = dyn->rs.depth_clamp_enable ?
1198                            MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
1199 
1200          SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
1201          SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
1202 
1203          SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ?
1204                                         dyn->vp.viewport_count - 1 : 0);
1205       }
1206 
1207       /* If the HW state is already considered dirty or the previously
1208        * programmed viewport count is smaller than what we need, update the
1209        * viewport count and ensure the HW state is dirty. Otherwise, if the
1210        * number of viewports programmed previously was larger than what we
1211        * need now, there is no need to reemit; we can keep the old values.
1212        */
1213       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
1214           hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
1215          hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
1216          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
1217       }
1218       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
1219           hw_state->vp_cc.count < dyn->vp.viewport_count) {
1220          hw_state->vp_cc.count = dyn->vp.viewport_count;
1221          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
1222       }
1223    }
1224 
1225    if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
1226        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
1227        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
1228       const VkRect2D *scissors = dyn->vp.scissors;
1229       const VkViewport *viewports = dyn->vp.viewports;
1230 
1231       for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
1232          const VkRect2D *s = &scissors[i];
1233          const VkViewport *vp = &viewports[i];
1234 
1235          const int max = 0xffff;
1236 
1237          uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
1238          uint32_t x_min = MAX2(s->offset.x, vp->x);
1239          int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
1240                               MAX2(vp->y, vp->y + vp->height) - 1);
1241          int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
1242                               vp->x + vp->width - 1);
1243 
1244          y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
1245          x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
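         /* INT16_MAX >> 1 == 16383, i.e. the 16K maximum framebuffer
          * dimension minus one, which appears to be the largest value the
          * scissor rectangle fields can hold.
          */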
1246 
1247          /* Do this math using int64_t so overflow gets clamped correctly. */
1248          if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1249             y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
1250             x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
1251             y_max = CLAMP((uint64_t) y_max, 0,
1252                           gfx->render_area.offset.y +
1253                           gfx->render_area.extent.height - 1);
1254             x_max = CLAMP((uint64_t) x_max, 0,
1255                           gfx->render_area.offset.x +
1256                           gfx->render_area.extent.width - 1);
1257          }
1258 
1259          if (s->extent.width <= 0 || s->extent.height <= 0) {
1260             /* Since xmax and ymax are inclusive, we have to have xmax < xmin
1261              * or ymax < ymin for empty clips. In case clip x, y, width height
1262              * are all 0, the clamps below produce 0 for xmin, ymin, xmax,
1263              * ymax, which isn't what we want. Just special case empty clips
1264              * and produce a canonical empty clip.
1265              */
1266             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
1267             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
1268             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
1269             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
1270          } else {
1271             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
1272             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
1273             SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
1274             SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
1275          }
1276       }
1277 
1278       /* If the HW state is already considered dirty or the previously
1279        * programmed scissor count is smaller than what we need, update the
1280        * scissor count and ensure the HW state is dirty. Otherwise, if the
1281        * number of scissors programmed previously was larger than what we
1282        * need now, there is no need to reemit; we can keep the old values.
1283        */
1284       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
1285           hw_state->scissor.count < dyn->vp.scissor_count) {
1286          hw_state->scissor.count = dyn->vp.scissor_count;
1287          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
1288       }
1289    }
1290 
1291 #if GFX_VERx10 == 125
1292    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)) {
1293       unsigned fb_width, fb_height, tile_width, tile_height;
1294 
1295       if (cmd_buffer->device->physical->instance->enable_tbimr &&
1296           calculate_render_area(cmd_buffer, &fb_width, &fb_height) &&
1297           calculate_tile_dimensions(cmd_buffer, fb_width, fb_height,
1298                                     &tile_width, &tile_height)) {
1299          /* Use a batch size of 128 polygons per slice as recommended
1300           * by BSpec 68436 "TBIMR Programming".
1301           */
1302          const unsigned num_slices = cmd_buffer->device->info->num_slices;
1303          const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
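         /* Worked example: with num_slices == 4 this gives
          * batch_size = 2 * 256 = 512, and the TBIMRBatchSize field below
          * encodes it as log2(512) - 5 = 4 (i.e. batch_size = 32 << field).
          */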
1304 
1305          SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
1306          SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
1307          SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount,
1308              DIV_ROUND_UP(fb_height, tile_height));
1309          SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount,
1310              DIV_ROUND_UP(fb_width, tile_width));
1311          SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
1312              util_logbase2(batch_size) - 5);
1313          SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
1314          SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
1315       } else {
1316          hw_state->use_tbimr = false;
1317       }
1318    }
1319 #endif
1320 
1321 #undef GET
1322 #undef SET
1323 #undef SET_STAGE
1324 
1325    vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
1326 }
1327 
1328 static void
1329 emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
1330 {
1331 #if GFX_VERx10 >= 125
1332    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
1333       vfg.DistributionMode = RR_STRICT;
1334    }
1335    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
1336       vf.GeometryDistributionEnable = true;
1337    }
1338 #endif
1339 
1340 #if GFX_VER >= 12
1341    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
1342       pr.ReplicaMask = 1;
1343    }
1344 #endif
1345 
1346    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
1347       rr.CullMode = CULLMODE_NONE;
1348       rr.FrontFaceFillMode = FILL_MODE_SOLID;
1349       rr.BackFaceFillMode = FILL_MODE_SOLID;
1350    }
1351 
1352    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
1353    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);
1354 
1355 #if GFX_VER >= 11
1356    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
1357 #endif
1358 
1359    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
1360       clip.ClipEnable = true;
1361       clip.ClipMode = CLIPMODE_REJECT_ALL;
1362    }
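   /* CLIPMODE_REJECT_ALL makes the clipper discard every primitive, so the
    * dummy draws below exercise the geometry pipeline without rasterizing
    * anything.
    */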
1363 
1364    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
1365    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
1366    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
1367    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
1368    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
1369    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);
1370 
1371    uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
1372                                                GENX(3DSTATE_VERTEX_ELEMENTS));
1373    uint32_t *ve_pack_dest = &vertex_elements[1];
1374 
1375    for (int i = 0; i < 2; i++) {
1376       struct GENX(VERTEX_ELEMENT_STATE) element = {
1377          .Valid = true,
1378          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
1379          .Component0Control = VFCOMP_STORE_0,
1380          .Component1Control = VFCOMP_STORE_0,
1381          .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
1382          .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
1383       };
1384       GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
1385       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
1386    }
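   /* The 1 + 2 * 2 DWords above are one 3DSTATE_VERTEX_ELEMENTS header
    * DWord plus two VERTEX_ELEMENT_STATE entries of two DWords each. No
    * vertex buffers are bound, so both elements store constants: element 0
    * is (0, 0, 0, 0) and element 1 is (0, 0, 1, 1).
    */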
1387 
1388    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
1389       topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
1390    }
1391 
1392    /* Emit dummy draw per slice. */
1393    for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
1394       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1395          prim.VertexCountPerInstance = 3;
1396          prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
1397          prim.InstanceCount = 1;
1398          prim.VertexAccessType = SEQUENTIAL;
1399       }
1400    }
1401 }
1402 /**
1403  * This function handles dirty state emission to the batch buffer.
1404  */
1405 static void
1406 cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
1407 {
1408    struct anv_device *device = cmd_buffer->device;
1409    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1410    struct anv_graphics_pipeline *pipeline =
1411       anv_pipeline_to_graphics(gfx->base.pipeline);
1412    const struct vk_dynamic_graphics_state *dyn =
1413       &cmd_buffer->vk.dynamic_graphics_state;
1414    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
1415 
1416    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
1417       genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);
1418 
1419       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);
1420 
1421       memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
1422              sizeof(struct intel_urb_config));
1423    }
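   /* The copy keeps gfx->urb_cfg in sync with what was just programmed;
    * genX(urb_workaround) presumably compares against it on the next URB
    * reconfiguration.
    */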
1424 
1425    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE))
1426       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ms);
1427 
1428    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
1429       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication);
1430 
1431    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
1432       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing);
1433 
1434    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
1435       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs);
1436 
1437 #if GFX_VER >= 11
1438    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
1439       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2);
1440 #endif
1441 
1442    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS))
1443       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vs);
1444 
1445    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS))
1446       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
1447 
1448    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS))
1449       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
1450 
1451    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS))
1452       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_statistics);
1453 
1454    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
1455       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);
1456 
1457    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
1458       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz);
1459 
1460    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
1461       /* Wa_16011773973:
1462        * If SOL is enabled and SO_DECL state has to be programmed,
1463        *    1. Send 3D State SOL state with SOL disabled
1464        *    2. Send SO_DECL NP state
1465        *    3. Send 3D State SOL with SOL Enabled
1466        */
1467       if (intel_needs_workaround(device->info, 16011773973) &&
1468           pipeline->uses_xfb)
1469          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);
1470 
1471       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
1472                                     final.so_decl_list);
1473 
1474 #if GFX_VER >= 11
1475       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
1476        * 3DSTATE_SO_DECL_LIST:
1477        *
1478        *    "Workaround: This command must be followed by a PIPE_CONTROL with
1479        *     CS Stall bit set."
1480        *
1481        * On DG2+ also known as Wa_1509820217.
1482        */
1483       genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
1484                                    cmd_buffer->state.current_pipeline,
1485                                    ANV_PIPE_CS_STALL_BIT);
1486 #endif
1487    }
1488 
1489    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS))
1490       anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ps);
1491 
1492    if (device->vk.enabled_extensions.EXT_mesh_shader) {
1493       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL))
1494          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_control);
1495 
1496       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
1497          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader);
1498 
1499       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
1500          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib);
1501 
1502       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL))
1503          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_control);
1504 
1505       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
1506          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader);
1507 
1508       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
1509          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib);
1510 
1511       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
1512          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_mesh);
1513 
1514       if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
1515          anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.clip_mesh);
1516    } else {
1517       assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
1518              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
1519              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
1520              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
1521              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
1522              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
1523              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
1524              !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
1525    }
1526 
1527 #define INIT(category, name) \
1528    .name = hw_state->category.name
1529 #define SET(s, category, name) \
1530    s.name = hw_state->category.name
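   /* INIT expands to a designated initializer pulling a field out of
    * hw_state; SET assigns the same field inside an anv_batch_emit/merge
    * block. Both keep the packed HW instruction in sync with the shadowed
    * dynamic state.
    */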
1531 
1532    /* Now the potentially dynamic instructions */
1533 
1534    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) {
1535       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
1536                            pipeline, partial.ps_extra, pse) {
1537          SET(pse, ps_extra, PixelShaderKillsPixel);
1538       }
1539    }
1540 
1541    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
1542       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
1543                            pipeline, partial.clip, clip) {
1544          SET(clip, clip, APIMode);
1545          SET(clip, clip, ViewportXYClipTestEnable);
1546          SET(clip, clip, TriangleStripListProvokingVertexSelect);
1547          SET(clip, clip, LineStripListProvokingVertexSelect);
1548          SET(clip, clip, TriangleFanProvokingVertexSelect);
1549          SET(clip, clip, MaximumVPIndex);
1550       }
1551    }
1552 
1553    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
1554       genX(streamout_prologue)(cmd_buffer);
1555 
1556       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
1557                            pipeline, partial.so, so) {
1558          SET(so, so, RenderingDisable);
1559          SET(so, so, RenderStreamSelect);
1560          SET(so, so, ReorderMode);
1561          SET(so, so, ForceRendering);
1562       }
1563    }
1564 
1565    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
1566       struct anv_state sf_clip_state =
1567          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1568                                             hw_state->vp_sf_clip.count * 64, 64);
1569 
1570       for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
1571          struct GENX(SF_CLIP_VIEWPORT) sfv = {
1572             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
1573             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
1574             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
1575             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
1576             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
1577             INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
1578             INIT(vp_sf_clip.elem[i], XMinClipGuardband),
1579             INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
1580             INIT(vp_sf_clip.elem[i], YMinClipGuardband),
1581             INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
1582             INIT(vp_sf_clip.elem[i], XMinViewPort),
1583             INIT(vp_sf_clip.elem[i], XMaxViewPort),
1584             INIT(vp_sf_clip.elem[i], YMinViewPort),
1585             INIT(vp_sf_clip.elem[i], YMaxViewPort),
1586          };
1587          GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
1588       }
1589 
1590       anv_batch_emit(&cmd_buffer->batch,
1591                      GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
1592          clip.SFClipViewportPointer = sf_clip_state.offset;
1593       }
1594    }
1595 
1596    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC)) {
1597       struct anv_state cc_state =
1598          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1599                                             hw_state->vp_cc.count * 8, 32);
1600 
1601       for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
1602          struct GENX(CC_VIEWPORT) cc_viewport = {
1603             INIT(vp_cc.elem[i], MinimumDepth),
1604             INIT(vp_cc.elem[i], MaximumDepth),
1605          };
1606          GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
1607       }
1608 
1609       anv_batch_emit(&cmd_buffer->batch,
1610                      GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
1611          cc.CCViewportPointer = cc_state.offset;
1612       }
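      /* Record that a CC viewport has been programmed; the Wa_18020335297
       * handling in genX(cmd_buffer_flush_gfx_hw_state) only kicks in once
       * this is set.
       */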
1613       cmd_buffer->state.gfx.viewport_set = true;
1614    }
1615 
1616    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
1617       /* Wa_1409725701:
1618        *
1619        *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
1620        *    stored as an array of up to 16 elements. The location of first
1621        *    element of the array, as specified by Pointer to SCISSOR_RECT,
1622        *    should be aligned to a 64-byte boundary.
1623        */
1624       struct anv_state scissor_state =
1625          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1626                                             hw_state->scissor.count * 8, 64);
1627 
1628       for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
1629          struct GENX(SCISSOR_RECT) scissor = {
1630             INIT(scissor.elem[i], ScissorRectangleYMin),
1631             INIT(scissor.elem[i], ScissorRectangleXMin),
1632             INIT(scissor.elem[i], ScissorRectangleYMax),
1633             INIT(scissor.elem[i], ScissorRectangleXMax),
1634          };
1635          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
1636       }
1637 
1638       anv_batch_emit(&cmd_buffer->batch,
1639                      GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
1640          ssp.ScissorRectPointer = scissor_state.offset;
1641       }
1642    }
1643 
1644    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
1645       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
1646          SET(vft, vft, PrimitiveTopologyType);
1647       }
1648    }
1649 
1650    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
1651       const uint32_t ve_count =
1652          pipeline->vs_input_elements + pipeline->svgs_count;
1653       const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
1654       uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
1655                                     GENX(3DSTATE_VERTEX_ELEMENTS));
1656 
1657       if (p) {
1658          if (ve_count == 0) {
1659             memcpy(p + 1, cmd_buffer->device->empty_vs_input,
1660                    sizeof(cmd_buffer->device->empty_vs_input));
1661          } else if (ve_count == pipeline->vertex_input_elems) {
1662             /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so
1663              * everything is in pipeline->vertex_input_data and we can just
1664              * memcpy
1665              */
1666             memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
1667             anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
1668                                           final.vf_instancing);
1669          } else {
1670             assert(pipeline->final.vf_instancing.len == 0);
1671             /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
1672             genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
1673                                     pipeline, dyn->vi, false /* emit_in_pipeline */);
1674             /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
1675             memcpy(p + 1 + 2 * pipeline->vs_input_elements,
1676                    pipeline->vertex_input_data,
1677                    4 * 2 * pipeline->vertex_input_elems);
1678          }
1679       }
1680    }
1681 
1682    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
1683       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
1684                            pipeline, partial.te, te) {
1685          SET(te, te, OutputTopology);
1686       }
1687    }
1688 
1689    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
1690       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
1691                            pipeline, partial.gs, gs) {
1692          SET(gs, gs, ReorderMode);
1693       }
1694    }
1695 
1696    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
1697 #if GFX_VER == 11
1698       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
1699          SET(cps, cps, CoarsePixelShadingMode);
1700          SET(cps, cps, MinCPSizeX);
1701          SET(cps, cps, MinCPSizeY);
1702       }
1703 #elif GFX_VER >= 12
1704       /* TODO: we can optimize this flush in the following cases:
1705        *
1706        *    In the case where the last geometry shader emits a value that is
1707        *    not constant, we can avoid this stall because we can synchronize
1708        *    the pixel shader internally with
1709        *    3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
1710        *
1711        *    If we know that the previous pipeline and the current one are
1712        *    using the same fragment shading rate.
1713        */
1714       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1715 #if GFX_VERx10 >= 125
1716          pc.PSSStallSyncEnable = true;
1717 #else
1718          pc.PSDSyncEnable = true;
1719 #endif
1720       }
1721 
1722       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
1723          SET(cps, cps, CoarsePixelShadingStateArrayPointer);
1724       }
1725 #endif
1726    }
1727 
1728    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
1729       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
1730                            pipeline, partial.sf, sf) {
1731          SET(sf, sf, LineWidth);
1732          SET(sf, sf, TriangleStripListProvokingVertexSelect);
1733          SET(sf, sf, LineStripListProvokingVertexSelect);
1734          SET(sf, sf, TriangleFanProvokingVertexSelect);
1735          SET(sf, sf, LegacyGlobalDepthBiasEnable);
1736       }
1737    }
1738 
1739    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
1740       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
1741                            pipeline, partial.raster, raster) {
1742          SET(raster, raster, APIMode);
1743          SET(raster, raster, DXMultisampleRasterizationEnable);
1744          SET(raster, raster, AntialiasingEnable);
1745          SET(raster, raster, CullMode);
1746          SET(raster, raster, FrontWinding);
1747          SET(raster, raster, GlobalDepthOffsetEnableSolid);
1748          SET(raster, raster, GlobalDepthOffsetEnableWireframe);
1749          SET(raster, raster, GlobalDepthOffsetEnablePoint);
1750          SET(raster, raster, GlobalDepthOffsetConstant);
1751          SET(raster, raster, GlobalDepthOffsetScale);
1752          SET(raster, raster, GlobalDepthOffsetClamp);
1753          SET(raster, raster, FrontFaceFillMode);
1754          SET(raster, raster, BackFaceFillMode);
1755          SET(raster, raster, ViewportZFarClipTestEnable);
1756          SET(raster, raster, ViewportZNearClipTestEnable);
1757          SET(raster, raster, ConservativeRasterizationEnable);
1758       }
1759    }
1760 
1761    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
1762       struct anv_state cc_state =
1763          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1764                                             GENX(COLOR_CALC_STATE_length) * 4,
1765                                             64);
1766       struct GENX(COLOR_CALC_STATE) cc = {
1767          INIT(cc, BlendConstantColorRed),
1768          INIT(cc, BlendConstantColorGreen),
1769          INIT(cc, BlendConstantColorBlue),
1770          INIT(cc, BlendConstantColorAlpha),
1771       };
1772       GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
1773 
1774       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
1775          ccp.ColorCalcStatePointer = cc_state.offset;
1776          ccp.ColorCalcStatePointerValid = true;
1777       }
1778    }
1779 
1780    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
1781       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
1782          SET(sm, sm, SampleMask);
1783       }
1784    }
1785 
1786    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
1787       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
1788          SET(ds, ds, DoubleSidedStencilEnable);
1789          SET(ds, ds, StencilTestMask);
1790          SET(ds, ds, StencilWriteMask);
1791          SET(ds, ds, BackfaceStencilTestMask);
1792          SET(ds, ds, BackfaceStencilWriteMask);
1793          SET(ds, ds, StencilReferenceValue);
1794          SET(ds, ds, BackfaceStencilReferenceValue);
1795          SET(ds, ds, DepthTestEnable);
1796          SET(ds, ds, DepthBufferWriteEnable);
1797          SET(ds, ds, DepthTestFunction);
1798          SET(ds, ds, StencilTestEnable);
1799          SET(ds, ds, StencilBufferWriteEnable);
1800          SET(ds, ds, StencilFailOp);
1801          SET(ds, ds, StencilPassDepthPassOp);
1802          SET(ds, ds, StencilPassDepthFailOp);
1803          SET(ds, ds, StencilTestFunction);
1804          SET(ds, ds, BackfaceStencilFailOp);
1805          SET(ds, ds, BackfaceStencilPassDepthPassOp);
1806          SET(ds, ds, BackfaceStencilPassDepthFailOp);
1807          SET(ds, ds, BackfaceStencilTestFunction);
1808       }
1809    }
1810 
1811 #if GFX_VER >= 12
1812    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
1813       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
1814          SET(db, db, DepthBoundsTestEnable);
1815          SET(db, db, DepthBoundsTestMinValue);
1816          SET(db, db, DepthBoundsTestMaxValue);
1817       }
1818    }
1819 #endif
1820 
1821    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
1822       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
1823          SET(ls, ls, LineStipplePattern);
1824          SET(ls, ls, LineStippleInverseRepeatCount);
1825          SET(ls, ls, LineStippleRepeatCount);
1826       }
1827 #if GFX_VER >= 11
1828       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
1829        * 3DSTATE_LINE_STIPPLE:
1830        *
1831        *    "Workaround: This command must be followed by a PIPE_CONTROL with
1832        *     CS Stall bit set."
1833        */
1834       genx_batch_emit_pipe_control(&cmd_buffer->batch,
1835                                    cmd_buffer->device->info,
1836                                    cmd_buffer->state.current_pipeline,
1837                                    ANV_PIPE_CS_STALL_BIT);
1838 #endif
1839    }
1840 
1841    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
1842       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
1843 #if GFX_VERx10 >= 125
1844          vf.GeometryDistributionEnable = true;
1845 #endif
1846          SET(vf, vf, IndexedDrawCutIndexEnable);
1847          SET(vf, vf, CutIndex);
1848       }
1849    }
1850 
1851    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
1852       struct anv_buffer *buffer = gfx->index_buffer;
1853       uint32_t offset = gfx->index_offset;
1854       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
1855          ib.IndexFormat           = gfx->index_type;
1856          ib.MOCS                  = anv_mocs(cmd_buffer->device,
1857                                              buffer ? buffer->address.bo : NULL,
1858                                              ISL_SURF_USAGE_INDEX_BUFFER_BIT);
1859 #if GFX_VER >= 12
1860          ib.L3BypassDisable       = true;
1861 #endif
1862          if (buffer) {
1863             ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
1864             ib.BufferSize            = gfx->index_size;
1865          }
1866       }
1867    }
1868 
1869 #if GFX_VERx10 >= 125
1870    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
1871       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
1872                            pipeline, partial.vfg, vfg) {
1873          SET(vfg, vfg, ListCutIndexEnable);
1874       }
1875    }
1876 #endif
1877 
1878    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
1879       genX(emit_sample_pattern)(&cmd_buffer->batch,
1880                                 dyn->ms.sample_locations_enable ?
1881                                 dyn->ms.sample_locations : NULL);
1882    }
1883 
1884    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
1885       anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
1886                            pipeline, partial.wm, wm) {
1887          SET(wm, wm, ForceThreadDispatchEnable);
1888          SET(wm, wm, LineStippleEnable);
1889       }
1890    }
1891 
1892    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
1893       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
1894          SET(blend, ps_blend, HasWriteableRT);
1895          SET(blend, ps_blend, ColorBufferBlendEnable);
1896          SET(blend, ps_blend, SourceAlphaBlendFactor);
1897          SET(blend, ps_blend, DestinationAlphaBlendFactor);
1898          SET(blend, ps_blend, SourceBlendFactor);
1899          SET(blend, ps_blend, DestinationBlendFactor);
1900          SET(blend, ps_blend, AlphaTestEnable);
1901          SET(blend, ps_blend, IndependentAlphaBlendEnable);
1902          SET(blend, ps_blend, AlphaToCoverageEnable);
1903       }
1904    }
1905 
1906    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
1907       const uint32_t num_dwords = GENX(BLEND_STATE_length) +
1908          GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
1909       struct anv_state blend_states =
1910          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
1911                                             num_dwords * 4,
1912                                             64);
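      /* The allocation holds one BLEND_STATE header followed by MAX_RTS
       * BLEND_STATE_ENTRY records; dws is advanced past the header below
       * before the per-RT entries are packed.
       */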
1913 
1914       uint32_t *dws = blend_states.map;
1915 
1916       struct GENX(BLEND_STATE) blend_state = {
1917          INIT(blend, AlphaToCoverageEnable),
1918          INIT(blend, AlphaToOneEnable),
1919          INIT(blend, IndependentAlphaBlendEnable),
1920       };
1921       GENX(BLEND_STATE_pack)(NULL, blend_states.map, &blend_state);
1922 
1923       /* Jump to blend entries. */
1924       dws += GENX(BLEND_STATE_length);
1925       for (uint32_t i = 0; i < MAX_RTS; i++) {
1926          struct GENX(BLEND_STATE_ENTRY) entry = {
1927             INIT(blend.rts[i], WriteDisableAlpha),
1928             INIT(blend.rts[i], WriteDisableRed),
1929             INIT(blend.rts[i], WriteDisableGreen),
1930             INIT(blend.rts[i], WriteDisableBlue),
1931             INIT(blend.rts[i], LogicOpFunction),
1932             INIT(blend.rts[i], LogicOpEnable),
1933             INIT(blend.rts[i], ColorBufferBlendEnable),
1934             INIT(blend.rts[i], ColorClampRange),
1935             INIT(blend.rts[i], PreBlendColorClampEnable),
1936             INIT(blend.rts[i], PostBlendColorClampEnable),
1937             INIT(blend.rts[i], SourceBlendFactor),
1938             INIT(blend.rts[i], DestinationBlendFactor),
1939             INIT(blend.rts[i], ColorBlendFunction),
1940             INIT(blend.rts[i], SourceAlphaBlendFactor),
1941             INIT(blend.rts[i], DestinationAlphaBlendFactor),
1942             INIT(blend.rts[i], AlphaBlendFunction),
1943          };
1944 
1945          GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
1946          dws += GENX(BLEND_STATE_ENTRY_length);
1947       }
1948 
1949       gfx->blend_states = blend_states;
1950       /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below */
1951       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_POINTERS);
1952    }
1953 
1954    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_POINTERS)) {
1955       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
1956          bsp.BlendStatePointer      = gfx->blend_states.offset;
1957          bsp.BlendStatePointerValid = true;
1958       }
1959    }
1960 
1961 #if GFX_VERx10 >= 125
1962    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
1963       genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1964                                    cmd_buffer->state.current_pipeline,
1965                                    ANV_PIPE_PSS_STALL_SYNC_BIT);
1966    }
1967 #endif
1968 
1969 #if GFX_VER == 9
1970    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
1971       genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
1972 #endif
1973 
1974 #if GFX_VERx10 >= 125
1975    if (hw_state->use_tbimr &&
1976        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
1977       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
1978                      tbimr) {
1979          SET(tbimr, tbimr, TileRectangleHeight);
1980          SET(tbimr, tbimr, TileRectangleWidth);
1981          SET(tbimr, tbimr, VerticalTileCount);
1982          SET(tbimr, tbimr, HorizontalTileCount);
1983          SET(tbimr, tbimr, TBIMRBatchSize);
1984          SET(tbimr, tbimr, TileBoxCheck);
1985       }
1986    }
1987 #endif
1988 
1989 #undef INIT
1990 #undef SET
1991 
1992    BITSET_ZERO(hw_state->dirty);
1993 }
1994 
1995 /**
1996  * This function handles possible state workarounds and emits the dirty
1997  * instructions to the batch buffer.
1998  */
1999 void
2000 genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
2001 {
2002    struct anv_device *device = cmd_buffer->device;
2003    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2004    struct anv_graphics_pipeline *pipeline =
2005       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2006    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
2007 
2008    if (INTEL_DEBUG(DEBUG_REEMIT)) {
2009       BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
2010                 device->gfx_dirty_state);
2011    }
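   /* INTEL_DEBUG=reemit ORs every known gfx state bit into the dirty set,
    * forcing a full state reemission on each flush (useful when hunting
    * for missing dirty-bit tracking).
    */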
2012 
2013    /**
2014     * Put potential workarounds here if you need to reemit an instruction
2015     * because another one is changing.
2016     */
2017 
2018    /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
2019     * it after.
2020     */
2021    if (intel_needs_workaround(device->info, 16011773973) &&
2022        pipeline->uses_xfb &&
2023        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
2024       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2025    }
2026 
2027    /* Gfx11 undocumented issue:
2028     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
2029     */
2030 #if GFX_VER == 11
2031    if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM))
2032       BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
2033 #endif
2034 
2035    /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
2036    if (intel_needs_workaround(device->info, 18020335297) &&
2037        BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) &&
2038        cmd_buffer->state.gfx.viewport_set) {
2039       /* For mesh, we implement the WA using CS stall. This is for
2040        * simplicity and takes care of possible interaction with Wa_16014390852.
2041        */
2042       if (anv_pipeline_is_mesh(pipeline)) {
2043          genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
2044                                       _3D, ANV_PIPE_CS_STALL_BIT);
2045       } else {
2046          /* Mask off all the instructions that the dummy draw reprograms. */
2047          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
2048          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
2049          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2050          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
2051          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2052          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2053          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2054          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
2055          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2056          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2057          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2058 
2059          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
2060          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
2061          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
2062          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
2063          BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
2064 
2065          cmd_buffer_gfx_state_emission(cmd_buffer);
2066 
2067          emit_wa_18020335297_dummy_draw(cmd_buffer);
2068 
2069          /* Dirty all emitted WA state to make sure that current real
2070           * state is restored.
2071           */
2072          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
2073          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
2074          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
2075          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
2076          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
2077          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
2078          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
2079          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
2080          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
2081          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
2082          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
2083 
2084          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
2085          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
2086          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
2087          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
2088          BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
2089       }
2090    }
2091 
2092    cmd_buffer_gfx_state_emission(cmd_buffer);
2093 }
2094 
2095 void
2096 genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
2097 {
2098    if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
2099       return;
2100 
2101    if (cmd_buffer->state.pma_fix_enabled == enable)
2102       return;
2103 
2104    cmd_buffer->state.pma_fix_enabled = enable;
2105 
2106    /* According to the Broadwell PIPE_CONTROL documentation, software should
2107     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2108     * prior to the LRI.  If stencil buffer writes are enabled, then a Render
2109     * Cache Flush is also necessary.
2110     *
2111     * The Skylake docs say to use a depth stall rather than a command
2112     * streamer stall.  However, the hardware seems to violently disagree.
2113     * A full command streamer stall seems to be needed in both cases.
2114     */
2115    genx_batch_emit_pipe_control
2116       (&cmd_buffer->batch, cmd_buffer->device->info,
2117        cmd_buffer->state.current_pipeline,
2118        ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2119        ANV_PIPE_CS_STALL_BIT |
2120 #if GFX_VER >= 12
2121        ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2122 #endif
2123        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2124 
2125 #if GFX_VER == 9
2126    uint32_t cache_mode;
2127    anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
2128                    .STCPMAOptimizationEnable = enable,
2129                    .STCPMAOptimizationEnableMask = true);
2130    anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2131       lri.RegisterOffset   = GENX(CACHE_MODE_0_num);
2132       lri.DataDWord        = cache_mode;
2133    }
2134 
2135 #endif /* GFX_VER == 9 */
2136 
2137    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2138     * Flush bits is often necessary.  We do it regardless because it's easier.
2139     * The render cache flush is also necessary if stencil writes are enabled.
2140     *
2141     * Again, the Skylake docs give a different set of flushes but the BDW
2142     * flushes seem to work just as well.
2143     */
2144    genx_batch_emit_pipe_control
2145       (&cmd_buffer->batch, cmd_buffer->device->info,
2146        cmd_buffer->state.current_pipeline,
2147        ANV_PIPE_DEPTH_STALL_BIT |
2148        ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
2149 #if GFX_VER >= 12
2150        ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2151 #endif
2152        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2153 }
2154