1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31 
32 #include "common/intel_aux_map.h"
33 #include "genxml/gen_macros.h"
34 #include "genxml/genX_pack.h"
35 #include "genxml/genX_rt_pack.h"
36 #include "common/intel_genX_state_brw.h"
37 
38 #include "ds/intel_tracepoints.h"
39 
40 /* We reserve:
41  *    - GPR 14 for secondary command buffer returns
42  *    - GPR 15 for conditional rendering
43  */
44 #define MI_BUILDER_NUM_ALLOC_GPRS 14
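/* mi_builder.h is shared across drivers and generations; it picks up the GPR
 * allocation limit above and the batch-emission hooks below at include time,
 * so all of these must be defined before the #include.
 */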
45 #define __gen_get_batch_dwords anv_batch_emit_dwords
46 #define __gen_address_offset anv_address_add
47 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
48 #include "common/mi_builder.h"
49 
50 #include "genX_cmd_draw_generated_flush.h"
51 
52 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
53                                         uint32_t pipeline);
54 
55 static enum anv_pipe_bits
56 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
57    enum anv_pipe_bits bits = 0;
58    bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
59    bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
60 #if GFX_VERx10 >= 125
61    bits |= (pc->PSSStallSyncEnable) ?  ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
62 #endif
63 #if GFX_VER == 12
64    bits |= (pc->TileCacheFlushEnable) ?  ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
65 #endif
66 #if GFX_VER >= 12
67    bits |= (pc->HDCPipelineFlushEnable) ?  ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
68 #endif
69    bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
70    bits |= (pc->VFCacheInvalidationEnable) ?  ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
71    bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
72    bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
73    bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
74    bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
75    bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
76    bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
77    bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
78 #if GFX_VERx10 == 125
79    bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
80    bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
81 #endif
82    return bits;
83 }
84 
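/* Dump the flush/invalidate bits carried by a PIPE_CONTROL packet, together
 * with the reason it is being emitted, whenever pipe-control debugging is
 * enabled.  Illustrative, hypothetical call site:
 *
 *    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
 *       pipe.CommandStreamerStallEnable = true;
 *       anv_debug_dump_pc(pipe, "CS stall example");
 *    }
 */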
85 #define anv_debug_dump_pc(pc, reason) \
86    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
87       fputs("pc: emit PC=( ", stdout); \
88       anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout);   \
89       fprintf(stdout, ") reason: %s\n", reason); \
90    }
91 
92 void
93 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
94 {
95    if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
96        anv_cmd_buffer_is_video_queue(cmd_buffer))
97       return;
98 
99    struct anv_device *device = cmd_buffer->device;
100    uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
101 
102    /* If we are emitting a new state base address we probably need to re-emit
103     * binding tables.
104     */
105    cmd_buffer->state.descriptors_dirty |= ~0;
106 
107 #if GFX_VERx10 >= 125
108    genx_batch_emit_pipe_control(&cmd_buffer->batch,
109                                 cmd_buffer->device->info,
110                                 cmd_buffer->state.current_pipeline,
111                                 ANV_PIPE_CS_STALL_BIT);
112    anv_batch_emit(
113       &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
114       btpa.BindingTablePoolBaseAddress =
115          anv_cmd_buffer_surface_base_address(cmd_buffer);
116       btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
117       btpa.MOCS = mocs;
118    }
119 #else /* GFX_VERx10 < 125 */
120    /* Emit a render target cache flush.
121     *
122     * This isn't documented anywhere in the PRM.  However, it seems to be
123     * necessary prior to changing the surface state base address.  Without
124     * this, we get GPU hangs when using multi-level command buffers which
125     * clear depth, reset state base address, and then go render stuff.
126     */
127    genx_batch_emit_pipe_control
128       (&cmd_buffer->batch, cmd_buffer->device->info,
129        cmd_buffer->state.current_pipeline,
130 #if GFX_VER >= 12
131        ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
132 #else
133        ANV_PIPE_DATA_CACHE_FLUSH_BIT |
134 #endif
135        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
136        ANV_PIPE_CS_STALL_BIT);
137 
138 #if INTEL_NEEDS_WA_1607854226
139    /* Wa_1607854226:
140     *
141     *  Workaround the non-pipelined state not applying in MEDIA/GPGPU pipeline
142     *  mode by putting the pipeline temporarily in 3D mode.
143     */
144    uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
145    genX(flush_pipeline_select_3d)(cmd_buffer);
146 #endif
147 
148    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
149       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
150       sba.GeneralStateMOCS = mocs;
151       sba.GeneralStateBaseAddressModifyEnable = true;
152 
153       sba.StatelessDataPortAccessMOCS = mocs;
154 
155       sba.SurfaceStateBaseAddress =
156          anv_cmd_buffer_surface_base_address(cmd_buffer);
157       sba.SurfaceStateMOCS = mocs;
158       sba.SurfaceStateBaseAddressModifyEnable = true;
159 
160       sba.DynamicStateBaseAddress =
161          (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
162       sba.DynamicStateMOCS = mocs;
163       sba.DynamicStateBaseAddressModifyEnable = true;
164 
165       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
166       sba.IndirectObjectMOCS = mocs;
167       sba.IndirectObjectBaseAddressModifyEnable = true;
168 
169       sba.InstructionBaseAddress =
170          (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
171       sba.InstructionMOCS = mocs;
172       sba.InstructionBaseAddressModifyEnable = true;
173 
174       sba.GeneralStateBufferSize       = 0xfffff;
175       sba.IndirectObjectBufferSize     = 0xfffff;
176       sba.DynamicStateBufferSize       = (device->physical->va.dynamic_state_pool.size +
177                                           device->physical->va.sampler_state_pool.size) / 4096;
178       sba.InstructionBufferSize        = device->physical->va.instruction_state_pool.size / 4096;
179       sba.GeneralStateBufferSizeModifyEnable    = true;
180       sba.IndirectObjectBufferSizeModifyEnable  = true;
181       sba.DynamicStateBufferSizeModifyEnable    = true;
182       sba.InstructionBuffersizeModifyEnable     = true;
183 
184 #if GFX_VER >= 11
185       sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
186       sba.BindlessSamplerStateBufferSize = 0;
187       sba.BindlessSamplerStateMOCS = mocs;
188       sba.BindlessSamplerStateBaseAddressModifyEnable = true;
189 #endif
190 
191       if (!device->physical->indirect_descriptors) {
192 #if GFX_VERx10 >= 125
193          /* Bindless Surface State & Bindless Sampler State are aligned to the
194           * same heap
195           */
196          sba.BindlessSurfaceStateBaseAddress =
197             (struct anv_address) { .offset =
198             device->physical->va.binding_table_pool.addr, };
199          sba.BindlessSurfaceStateSize =
200             (device->physical->va.internal_surface_state_pool.size +
201              device->physical->va.bindless_surface_state_pool.size) - 1;
202          sba.BindlessSurfaceStateMOCS = mocs;
203          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
204 #else
205          unreachable("Direct descriptor not supported");
206 #endif
207       } else {
208          sba.BindlessSurfaceStateBaseAddress =
209             (struct anv_address) { .offset =
210             device->physical->va.bindless_surface_state_pool.addr,
211          };
212          sba.BindlessSurfaceStateSize =
213             anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
214          sba.BindlessSurfaceStateMOCS = mocs;
215          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
216       }
217 
218 #if GFX_VERx10 >= 125
219       sba.L1CacheControl = L1CC_WB;
220 #endif
221    }
222 
223 #if INTEL_NEEDS_WA_1607854226
224    /* Wa_1607854226:
225     *
226     *  Put the pipeline back into its current mode.
227     */
228    if (gfx12_wa_pipeline != UINT32_MAX)
229       genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
230 #endif
231 
232 #endif /* GFX_VERx10 < 125 */
233 
234    /* After re-setting the surface state base address, we have to do some
235     * cache flushing so that the sampler engine will pick up the new
236     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
237     * Shared Function > 3D Sampler > State > State Caching (page 96):
238     *
239     *    Coherency with system memory in the state cache, like the texture
240     *    cache is handled partially by software. It is expected that the
241     *    command stream or shader will issue Cache Flush operation or
242     *    Cache_Flush sampler message to ensure that the L1 cache remains
243     *    coherent with system memory.
244     *
245     *    [...]
246     *
247     *    Whenever the value of the Dynamic_State_Base_Addr,
248     *    Surface_State_Base_Addr are altered, the L1 state cache must be
249     *    invalidated to ensure the new surface or sampler state is fetched
250     *    from system memory.
251     *
252     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
253     * which, according to the PIPE_CONTROL instruction documentation in the
254     * Broadwell PRM:
255     *
256     *    Setting this bit is independent of any other bit in this packet.
257     *    This bit controls the invalidation of the L1 and L2 state caches
258     *    at the top of the pipe i.e. at the parsing time.
259     *
260     * Unfortunately, experimentation seems to indicate that state cache
261     * invalidation through a PIPE_CONTROL does nothing whatsoever in
262     * regards to surface state and binding tables.  Instead, it seems that
263     * invalidating the texture cache is what is actually needed.
264     *
265     * XXX:  As far as we have been able to determine through
266     * experimentation, flushing the texture cache appears to be
267     * sufficient.  The theory here is that all of the sampling/rendering
268     * units cache the binding table in the texture cache.  However, we have
269     * yet to be able to actually confirm this.
270     *
271     * Wa_14013910100:
272     *
273     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
274     *   or program pipe control with Instruction cache invalidate post
275     *   STATE_BASE_ADDRESS command"
276     */
277    enum anv_pipe_bits bits =
278       ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
279       ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
280 #if GFX_VERx10 == 125
281       ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
282 #endif
283       ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
284 
285 #if GFX_VER >= 9 && GFX_VER <= 11
286       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
287        *
288        *    "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
289        *     always set for GPGPU workloads when “Texture Cache Invalidation
290        *     Enable” bit is set".
291        *
292        * This workaround stopped appearing in TGL PRMs.
293        */
294       if (cmd_buffer->state.current_pipeline == GPGPU)
295          bits |= ANV_PIPE_CS_STALL_BIT;
296 #endif
297    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
298                                 cmd_buffer->state.current_pipeline,
299                                 bits);
300 }
301 
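/* Track the BO backing a surface state so that it is referenced (and kept
 * resident) when this command buffer is submitted.
 */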
302 static void
303 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
304                   struct anv_address addr)
305 {
306    VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
307                                            addr.bo);
308 
309    if (unlikely(result != VK_SUCCESS))
310       anv_batch_set_error(&cmd_buffer->batch, result);
311 }
312 
313 static void
314 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
315                          const struct anv_surface_state *state)
316 {
317    assert(!anv_address_is_null(state->address));
318    add_surface_reloc(cmd_buffer, state->address);
319 
320    if (!anv_address_is_null(state->aux_address)) {
321       VkResult result =
322          anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
323                                state->aux_address.bo);
324       if (result != VK_SUCCESS)
325          anv_batch_set_error(&cmd_buffer->batch, result);
326    }
327 
328    if (!anv_address_is_null(state->clear_address)) {
329       VkResult result =
330          anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
331                                state->clear_address.bo);
332       if (result != VK_SUCCESS)
333          anv_batch_set_error(&cmd_buffer->batch, result);
334    }
335 }
336 
337 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
338  * the initial layout is undefined, the HiZ buffer and depth buffer will
339  * represent the same data at the end of this operation.
340  */
341 static void
342 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
343                         const struct anv_image *image,
344                         uint32_t base_layer, uint32_t layer_count,
345                         VkImageLayout initial_layout,
346                         VkImageLayout final_layout,
347                         bool will_full_fast_clear)
348 {
349    const uint32_t depth_plane =
350       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
351    if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
352       return;
353 
354    /* If will_full_fast_clear is set, the caller promises to fast-clear the
355     * largest portion of the specified range as it can.  For depth images,
356     * that means the entire image because we don't support multi-LOD HiZ.
357     */
358    assert(image->planes[0].primary_surface.isl.levels == 1);
359    if (will_full_fast_clear)
360       return;
361 
362    const enum isl_aux_state initial_state =
363       anv_layout_to_aux_state(cmd_buffer->device->info, image,
364                               VK_IMAGE_ASPECT_DEPTH_BIT,
365                               initial_layout,
366                               cmd_buffer->queue_family->queueFlags);
367    const enum isl_aux_state final_state =
368       anv_layout_to_aux_state(cmd_buffer->device->info, image,
369                               VK_IMAGE_ASPECT_DEPTH_BIT,
370                               final_layout,
371                               cmd_buffer->queue_family->queueFlags);
372 
373    const bool initial_depth_valid =
374       isl_aux_state_has_valid_primary(initial_state);
375    const bool initial_hiz_valid =
376       isl_aux_state_has_valid_aux(initial_state);
377    const bool final_needs_depth =
378       isl_aux_state_has_valid_primary(final_state);
379    const bool final_needs_hiz =
380       isl_aux_state_has_valid_aux(final_state);
381 
382    /* Getting into the pass-through state for Depth is tricky and involves
383     * both a resolve and an ambiguate.  We don't handle that state right now
384     * as anv_layout_to_aux_state never returns it.
385     */
386    assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
387 
388    if (final_needs_depth && !initial_depth_valid) {
389       assert(initial_hiz_valid);
390       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
391                        0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
392    } else if (final_needs_hiz && !initial_hiz_valid) {
393       assert(initial_depth_valid);
394       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
395                        0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
396    }
397 
398    /* Additional tile cache flush for MTL:
399     *
400     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
401     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
402     */
403    if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
404        image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
405        final_needs_depth && !initial_depth_valid) {
406       anv_add_pending_pipe_bits(cmd_buffer,
407                                 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
408                                 "HIZ-CCS flush");
409    }
410 }
411 
412 /* Transitions a stencil buffer from one layout to another. On Gfx12, when
413  * the initial layout is undefined, the compressed stencil buffer must first
414  * be initialized via a stencil clear (HZ_OP) before it can be used.
415  */
416 static void
417 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
418                           const struct anv_image *image,
419                           uint32_t base_level, uint32_t level_count,
420                           uint32_t base_layer, uint32_t layer_count,
421                           VkImageLayout initial_layout,
422                           VkImageLayout final_layout,
423                           bool will_full_fast_clear)
424 {
425 #if GFX_VER == 12
426    const uint32_t plane =
427       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
428    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
429       return;
430 
431    if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
432         initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
433        cmd_buffer->device->info->has_aux_map) {
434       /* If will_full_fast_clear is set, the caller promises to fast-clear the
435        * largest portion of the specified range as it can.
436        */
437       if (will_full_fast_clear)
438          return;
439 
440       for (uint32_t l = 0; l < level_count; l++) {
441          const uint32_t level = base_level + l;
442          const VkRect2D clear_rect = {
443             .offset.x = 0,
444             .offset.y = 0,
445             .extent.width = u_minify(image->vk.extent.width, level),
446             .extent.height = u_minify(image->vk.extent.height, level),
447          };
448 
449          uint32_t aux_layers =
450             anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
451          uint32_t level_layer_count =
452             MIN2(layer_count, aux_layers - base_layer);
453 
454          /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
455           * Enable:
456           *
457           *    "When enabled, Stencil Buffer needs to be initialized via
458           *    stencil clear (HZ_OP) before any renderpass."
459           */
460          anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
461                              level, base_layer, level_layer_count,
462                              clear_rect, 0 /* Stencil clear value */);
463       }
464    }
465 
466    /* Additional tile cache flush for MTL:
467     *
468     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
469     * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
470     */
471    if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
472       anv_add_pending_pipe_bits(cmd_buffer,
473                                 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
474                                 "HIZ-CCS flush");
475    }
476 #endif
477 }
478 
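/* MMIO offsets of the command streamer registers consumed by MI_PREDICATE.
 * The resolve predicates below are built by writing SRC0/SRC1 with MI stores
 * and then emitting MI_PREDICATE to produce MI_PREDICATE_RESULT.
 */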
479 #define MI_PREDICATE_SRC0    0x2400
480 #define MI_PREDICATE_SRC1    0x2408
481 #define MI_PREDICATE_RESULT  0x2418
482 
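/* Update the per-(level, layer) compression-state dwords tracked for the
 * image: a non-zero value means the CCS for that slice may contain
 * compressed data.
 */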
483 static void
484 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
485                          const struct anv_image *image,
486                          VkImageAspectFlagBits aspect,
487                          uint32_t level,
488                          uint32_t base_layer, uint32_t layer_count,
489                          bool compressed)
490 {
491    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
492 
493    /* We only have compression tracking for CCS_E */
494    if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
495       return;
496 
497    for (uint32_t a = 0; a < layer_count; a++) {
498       uint32_t layer = base_layer + a;
499       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
500          sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
501                                                             image, aspect,
502                                                             level, layer);
503          sdi.ImmediateData = compressed ? UINT32_MAX : 0;
504       }
505    }
506 
507    /* FCV_CCS_E images are automatically fast cleared to default value at
508     * render time. In order to account for this, anv should set the
509     * appropriate fast clear state for level0/layer0.
510     *
511     * At the moment, tracking the fast clear state for higher levels/layers is
512     * neither supported, nor do we enter a situation where it is a concern.
513     */
514    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
515        base_layer == 0 && level == 0) {
516       anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
517          sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
518                                                           image, aspect);
519          sdi.ImmediateData = ANV_FAST_CLEAR_DEFAULT_VALUE;
520       }
521    }
522 }
523 
524 static void
525 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
526                            const struct anv_image *image,
527                            VkImageAspectFlagBits aspect,
528                            enum anv_fast_clear_type fast_clear)
529 {
530    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
531       sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
532                                                        image, aspect);
533       sdi.ImmediateData = fast_clear;
534    }
535 
536    /* Whenever we have fast-clear, we consider that slice to be compressed.
537     * This makes building predicates much easier.
538     */
539    if (fast_clear != ANV_FAST_CLEAR_NONE)
540       set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
541 }
542 
543 /* This is only really practical on Haswell and above because it requires
544  * MI math in order to get it correct.
545  */
546 static void
547 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
548                                   const struct anv_image *image,
549                                   VkImageAspectFlagBits aspect,
550                                   uint32_t level, uint32_t array_layer,
551                                   enum isl_aux_op resolve_op,
552                                   enum anv_fast_clear_type fast_clear_supported)
553 {
554    struct anv_address addr = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
555                                                                 image, aspect);
556    struct mi_builder b;
557    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
558    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
559    mi_builder_set_mocs(&b, mocs);
560 
561    const struct mi_value fast_clear_type = mi_mem32(addr);
562 
563    if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
564       /* In this case, we're doing a full resolve which means we want the
565        * resolve to happen if any compression (including fast-clears) is
566        * present.
567        *
568        * In order to simplify the logic a bit, we make the assumption that,
569        * if the first slice has been fast-cleared, it is also marked as
570        * compressed.  See also set_image_fast_clear_state.
571        */
572       const struct mi_value compression_state =
573          mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
574                                                        image, aspect,
575                                                        level, array_layer));
576       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
577       mi_store(&b, compression_state, mi_imm(0));
578 
579       if (level == 0 && array_layer == 0) {
580          /* If the predicate is true, we want to write 0 to the fast clear type
581           * and, if it's false, leave it alone.  We can do this by writing
582           *
583           * clear_type = clear_type & ~predicate;
584           */
585          struct mi_value new_fast_clear_type =
586             mi_iand(&b, fast_clear_type,
587                         mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
588          mi_store(&b, fast_clear_type, new_fast_clear_type);
589       }
590    } else if (level == 0 && array_layer == 0) {
591       /* In this case, we are doing a partial resolve to get rid of fast-clear
592        * colors.  We don't care about the compression state but we do care
593        * about how much fast clear is allowed by the final layout.
594        */
595       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
596       assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
597 
598       /* We need to compute (fast_clear_supported < image->fast_clear) */
599       struct mi_value pred =
600          mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
601       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
602 
603       /* If the predicate is true, we want to write 0 to the fast clear type
604        * and, if it's false, leave it alone.  We can do this by writing
605        *
606        * clear_type = clear_type & ~predicate;
607        */
608       struct mi_value new_fast_clear_type =
609          mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
610       mi_store(&b, fast_clear_type, new_fast_clear_type);
611    } else {
612       /* In this case, we're trying to do a partial resolve on a slice that
613        * doesn't have clear color.  There's nothing to do.
614        */
615       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
616       return;
617    }
618 
619    /* Set src1 to 0 and use a != condition */
620    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
621 
622    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
623       mip.LoadOperation    = LOAD_LOADINV;
624       mip.CombineOperation = COMBINE_SET;
625       mip.CompareOperation = COMPARE_SRCS_EQUAL;
626    }
627 }
628 
629 static void
630 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
631                                const struct anv_image *image,
632                                enum isl_format format,
633                                struct isl_swizzle swizzle,
634                                VkImageAspectFlagBits aspect,
635                                uint32_t level, uint32_t array_layer,
636                                enum isl_aux_op resolve_op,
637                                enum anv_fast_clear_type fast_clear_supported)
638 {
639    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
640 
641    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
642                                      aspect, level, array_layer,
643                                      resolve_op, fast_clear_supported);
644 
645    /* CCS_D only supports full resolves and BLORP will assert on us if we try
646     * to do a partial resolve on a CCS_D surface.
647     */
648    if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
649        image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
650       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
651 
652    anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
653                     level, array_layer, 1, resolve_op, NULL, true);
654 }
655 
656 static void
657 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
658                                const struct anv_image *image,
659                                enum isl_format format,
660                                struct isl_swizzle swizzle,
661                                VkImageAspectFlagBits aspect,
662                                uint32_t array_layer,
663                                enum isl_aux_op resolve_op,
664                                enum anv_fast_clear_type fast_clear_supported)
665 {
666    assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
667    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
668 
669    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
670                                      aspect, 0, array_layer,
671                                      resolve_op, fast_clear_supported);
672 
673    anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
674                     array_layer, 1, resolve_op, NULL, true);
675 }
676 
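/* Mark a range of slices as written with compression enabled so that a later
 * layout transition knows a resolve may be required.
 */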
677 void
678 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
679                                     const struct anv_image *image,
680                                     VkImageAspectFlagBits aspect,
681                                     enum isl_aux_usage aux_usage,
682                                     uint32_t level,
683                                     uint32_t base_layer,
684                                     uint32_t layer_count)
685 {
686    /* The aspect must be exactly one of the image aspects. */
687    assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
688 
689    /* Filter out aux usages that don't have any compression tracking.
690     * Note: We only have compression tracking for CCS_E images, but it's
691     * possible for a CCS_E enabled image to have a subresource with a different
692     * aux usage.
693     */
694    if (!isl_aux_usage_has_compression(aux_usage))
695       return;
696 
697    set_image_compressed_bit(cmd_buffer, image, aspect,
698                             level, base_layer, layer_count, true);
699 }
700 
701 static void
702 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
703                       const struct anv_image *image,
704                       VkImageAspectFlagBits aspect)
705 {
706    assert(cmd_buffer && image);
707    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
708 
709    /* Initialize the struct fields that are accessed for fast clears so that
710     * the HW restrictions on the field values are satisfied.
711     *
712     * On generations that do not support indirect clear color natively, we
713     * can just skip initializing the values, because they will be set by
714     * BLORP before actually doing the fast clear.
715     *
716     * For newer generations, we may not be able to skip initialization.
717     * Testing shows that writing to CLEAR_COLOR causes corruption if
718     * the surface is currently being used. So, care must be taken here.
719     * There are two cases that we consider:
720     *
721     *    1. For CCS_E without FCV, we can skip initializing the color-related
722     *       fields, just like on the older platforms. Also, DWORDS 6 and 7
723     *       are marked MBZ (or have a usable field on gfx11), but we can skip
724     *       initializing them because in practice these fields need other
725     *       state to be programmed for their values to matter.
726     *
727     *    2. When the FCV optimization is enabled, we must initialize the
728     *       color-related fields. Otherwise, the engine might reference their
729     *       uninitialized contents before we fill them for a manual fast clear
730     *       with BLORP. Although the surface may be in use, no synchronization
731     *       is needed before initialization. The only possible clear color we
732     *       support in this mode is 0.
733     */
734 #if GFX_VER == 12
735    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
736 
737    if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
738       assert(!image->planes[plane].can_non_zero_fast_clear);
739       assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
740 
741       unsigned num_dwords = 6;
742       struct anv_address addr =
743          anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
744 
745       for (unsigned i = 0; i < num_dwords; i++) {
746          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
747             sdi.Address = addr;
748             sdi.Address.offset += i * 4;
749             sdi.ImmediateData = 0;
750             sdi.ForceWriteCompletionCheck = i == (num_dwords - 1);
751          }
752       }
753    }
754 #endif
755 }
756 
757 /* Copy the fast-clear value dword(s) between a surface state object and an
758  * image's fast clear state buffer.
759  */
760 void
761 genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
762                              struct anv_state surface_state,
763                              const struct anv_image *image)
764 {
765 #if GFX_VER < 10
766    assert(cmd_buffer && image);
767    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
768 
769    struct anv_address ss_clear_addr =
770       anv_state_pool_state_address(
771          &cmd_buffer->device->internal_surface_state_pool,
772          (struct anv_state) {
773             .offset = surface_state.offset +
774                       cmd_buffer->device->isl_dev.ss.clear_value_offset
775          });
776    const struct anv_address entry_addr =
777       anv_image_get_clear_color_addr(cmd_buffer->device, image,
778                                      VK_IMAGE_ASPECT_COLOR_BIT);
779    unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
780 
781    struct mi_builder b;
782    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
783 
784    mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
785 
786    /* Updating a surface state object may require that the state cache be
787     * invalidated. From the SKL PRM, Shared Functions -> State -> State
788     * Caching:
789     *
790     *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
791     *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
792     *    modified [...], the L1 state cache must be invalidated to ensure
793     *    the new surface or sampler state is fetched from system memory.
794     *
795     * In testing, SKL doesn't actually seem to need this, but HSW does.
796     */
797    anv_add_pending_pipe_bits(cmd_buffer,
798                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
799                              "after load_image_clear_color surface state update");
800 #endif
801 }
802 
803 void
804 genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
805                            const struct anv_image *image,
806                            const enum isl_format format,
807                            union isl_color_value clear_color)
808 {
809    if (isl_color_value_is_zero(clear_color, format)) {
810       /* This image has the auxiliary buffer enabled. We can mark the
811        * subresource as not needing a resolve because the clear color
812        * will match what's in every RENDER_SURFACE_STATE object when
813        * it's being used for sampling.
814        */
815       set_image_fast_clear_state(cmd_buffer, image,
816                                  VK_IMAGE_ASPECT_COLOR_BIT,
817                                  ANV_FAST_CLEAR_DEFAULT_VALUE);
818    } else {
819       set_image_fast_clear_state(cmd_buffer, image,
820                                  VK_IMAGE_ASPECT_COLOR_BIT,
821                                  ANV_FAST_CLEAR_ANY);
822    }
823 }
824 
825 /**
826  * @brief Transitions a color buffer from one layout to another.
827  *
828  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
829  * more information.
830  *
831  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
832  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
833  *                    this represents the maximum layers to transition at each
834  *                    specified miplevel.
835  */
836 static void
837 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
838                         const struct anv_image *image,
839                         VkImageAspectFlagBits aspect,
840                         const uint32_t base_level, uint32_t level_count,
841                         uint32_t base_layer, uint32_t layer_count,
842                         VkImageLayout initial_layout,
843                         VkImageLayout final_layout,
844                         uint32_t src_queue_family,
845                         uint32_t dst_queue_family,
846                         bool will_full_fast_clear)
847 {
848    struct anv_device *device = cmd_buffer->device;
849    const struct intel_device_info *devinfo = device->info;
850    /* Validate the inputs. */
851    assert(cmd_buffer);
852    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
853    /* These values aren't supported for simplicity's sake. */
854    assert(level_count != VK_REMAINING_MIP_LEVELS &&
855           layer_count != VK_REMAINING_ARRAY_LAYERS);
856    /* Ensure the subresource range is valid. */
857    UNUSED uint64_t last_level_num = base_level + level_count;
858    const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
859    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
860    assert((uint64_t)base_layer + layer_count  <= image_layers);
861    assert(last_level_num <= image->vk.mip_levels);
862    /* If there is a layout transition, the final layout cannot be undefined or
863     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
864     */
865    assert(initial_layout == final_layout ||
866           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
867            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
868    const struct isl_drm_modifier_info *isl_mod_info =
869       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
870       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
871       : NULL;
872 
873    const bool src_queue_external =
874       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
875       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
876 
877    const bool dst_queue_external =
878       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
879       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
880 
881    /* If a queue is external, use the flags of the first queue family
882     * (which should be the most capable one).
883     */
884    const VkQueueFlagBits src_queue_flags =
885       device->physical->queue.families[
886          (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
887          0 : src_queue_family].queueFlags;
888    const VkQueueFlagBits dst_queue_flags =
889       device->physical->queue.families[
890          (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
891          0 : dst_queue_family].queueFlags;
892 
893    /* Simultaneous acquire and release on external queues is illegal. */
894    assert(!src_queue_external || !dst_queue_external);
895 
896    /* Ownership transition on an external queue requires special action if the
897     * image has a DRM format modifier because we store image data in
898     * a driver-private bo which is inaccessible to the external queue.
899     */
900    const bool private_binding_acquire =
901       src_queue_external &&
902       anv_image_is_externally_shared(image) &&
903       anv_image_has_private_binding(image);
904 
905    const bool private_binding_release =
906       dst_queue_external &&
907       anv_image_is_externally_shared(image) &&
908       anv_image_has_private_binding(image);
909 
910    if (initial_layout == final_layout &&
911        !private_binding_acquire && !private_binding_release) {
912       /* No work is needed. */
913       return;
914    }
915 
916    /**
917     * Section 7.7.4 of the Vulkan 1.3.260 spec says:
918     *
919     *    If the transfer is via an image memory barrier, and an image layout
920     *    transition is desired, then the values of oldLayout and newLayout in the
921     *    release operation's memory barrier must be equal to values of oldLayout
922     *    and newLayout in the acquire operation's memory barrier. Although the
923     *    image layout transition is submitted twice, it will only be executed
924     *    once. A layout transition specified in this way happens-after the
925     *    release operation and happens-before the acquire operation.
926     *
927     * Because we know that we get matching transitions on each queue, we
928     * choose to only do the work on one queue type: RENDER. In the cases
929     * where we do transitions between COMPUTE & TRANSFER, we should have
930     * matching aux/fast_clear values, which trigger no work in the code below.
931     */
932    if (!(src_queue_external || dst_queue_external) &&
933        src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
934        dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
935        src_queue_family != dst_queue_family) {
936       enum intel_engine_class src_engine =
937          cmd_buffer->queue_family->engine_class;
938       if (src_engine != INTEL_ENGINE_CLASS_RENDER)
939          return;
940    }
941 
942    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
943 
944    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
945       return;
946 
947    enum isl_aux_usage initial_aux_usage =
948       anv_layout_to_aux_usage(devinfo, image, aspect, 0,
949                               initial_layout, src_queue_flags);
950    enum isl_aux_usage final_aux_usage =
951       anv_layout_to_aux_usage(devinfo, image, aspect, 0,
952                               final_layout, dst_queue_flags);
953    enum anv_fast_clear_type initial_fast_clear =
954       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
955                                     src_queue_flags);
956    enum anv_fast_clear_type final_fast_clear =
957       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
958                                     dst_queue_flags);
959 
960    /* We must override the anv_layout_to_* functions because they are unaware
961     * of acquire/release direction.
962     */
963    if (private_binding_acquire) {
964       initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
965          image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
966       initial_fast_clear = isl_mod_info->supports_clear_color ?
967          initial_fast_clear : ANV_FAST_CLEAR_NONE;
968    } else if (private_binding_release) {
969       final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
970          image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
971       final_fast_clear = isl_mod_info->supports_clear_color ?
972          final_fast_clear : ANV_FAST_CLEAR_NONE;
973    }
974 
975    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
976 
977    /* The following layouts are equivalent for non-linear images. */
978    const bool initial_layout_undefined =
979       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
980       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
981 
982    bool must_init_fast_clear_state = false;
983    bool must_init_aux_surface = false;
984 
985    if (initial_layout_undefined) {
986       /* The subresource may have been aliased and populated with arbitrary
987        * data.
988        */
989       must_init_fast_clear_state = true;
990 
991       if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
992           devinfo->has_illegal_ccs_values) {
993 
994          must_init_aux_surface = true;
995 
996       } else {
997          assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
998 
999          /* We can start using the CCS immediately without ambiguating. The
1000           * two conditions that enable this are:
1001           *
1002           * 1) The device treats all possible CCS values as legal. In other
1003           *    words, we can't confuse the hardware with random bits in the
1004           *    CCS.
1005           *
1006           * 2) We enable compression on all writable image layouts. The CCS
1007           *    will receive all writes and will therefore always be in sync
1008           *    with the main surface.
1009           *
1010           *    If we were to disable compression on some writable layouts, the
1011           *    CCS could get out of sync with the main surface and the app
1012           *    could lose the data it wrote previously. For example, this
1013           *    could happen if an app: transitions from UNDEFINED w/o
1014           *    ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
1015           *
1016           * The second condition is asserted below, but could be moved
1017           * elsewhere for more coverage (we're only checking transitions from
1018           * an undefined layout).
1019           */
1020          assert(vk_image_layout_is_read_only(final_layout, aspect) ||
1021                 (final_aux_usage != ISL_AUX_USAGE_NONE));
1022 
1023          must_init_aux_surface = false;
1024       }
1025 
1026    } else if (private_binding_acquire) {
1027       /* The fast clear state lives in a driver-private bo, and therefore the
1028        * external/foreign queue is unaware of it.
1029        *
1030        * If this is the first time we are accessing the image, then the fast
1031        * clear state is uninitialized.
1032        *
1033        * If this is NOT the first time we are accessing the image, then the fast
1034        * clear state may still be valid and correct due to the resolve during
1035        * our most recent ownership release.  However, we do not track the aux
1036        * state with MI stores, and therefore must assume the worst-case: that
1037        * this is the first time we are accessing the image.
1038        */
1039       assert(image->planes[plane].fast_clear_memory_range.binding ==
1040               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1041       must_init_fast_clear_state = true;
1042 
1043       if (anv_image_get_aux_memory_range(image, plane)->binding ==
1044           ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1045          /* The aux surface, like the fast clear state, lives in
1046           * a driver-private bo.  We must initialize the aux surface for the
1047           * same reasons we must initialize the fast clear state.
1048           */
1049          must_init_aux_surface = true;
1050       } else {
1051          /* The aux surface, unlike the fast clear state, lives in
1052           * application-visible VkDeviceMemory and is shared with the
1053           * external/foreign queue. Therefore, when we acquire ownership of the
1054           * image with a defined VkImageLayout, the aux surface is valid and has
1055           * the aux state required by the modifier.
1056           */
1057          must_init_aux_surface = false;
1058       }
1059    }
1060 
1061    if (must_init_fast_clear_state) {
1062       if (base_level == 0 && base_layer == 0) {
1063          set_image_fast_clear_state(cmd_buffer, image, aspect,
1064                                     ANV_FAST_CLEAR_NONE);
1065       }
1066       init_fast_clear_color(cmd_buffer, image, aspect);
1067    }
1068 
1069    if (must_init_aux_surface) {
1070       assert(must_init_fast_clear_state);
1071 
1072       /* Initialize the aux buffers to enable correct rendering.  In order to
1073        * ensure that things such as storage images work correctly, aux buffers
1074        * need to be initialized to valid data.
1075        *
1076        * Having an aux buffer with invalid data is a problem for two reasons:
1077        *
1078        *  1) Having an invalid value in the buffer can confuse the hardware.
1079        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1080        *     invalid and leads to the hardware doing strange things.  It
1081        *     doesn't hang as far as we can tell but rendering corruption can
1082        *     occur.
1083        *
1084        *  2) If this transition is into the GENERAL layout and we then use the
1085        *     image as a storage image, then we must have the aux buffer in the
1086        *     pass-through state so that, if we then go to texture from the
1087        *     image, we get the results of our storage image writes and not the
1088        *     fast clear color or other random data.
1089        *
1090        * For CCS both of the problems above are real demonstrable issues.  In
1091        * that case, the only thing we can do is to perform an ambiguate to
1092        * transition the aux surface into the pass-through state.
1093        *
1094        * For MCS, (2) is never an issue because we don't support multisampled
1095        * storage images.  In theory, issue (1) is a problem with MCS but we've
1096        * never seen it in the wild.  For 4x and 16x, all bit patterns could,
1097        * in theory, be interpreted as something but we don't know that all bit
1098        * patterns are actually valid.  For 2x and 8x, you could easily end up
1099        * with the MCS referring to an invalid plane because not all bits of
1100        * the MCS value are actually used.  Even though we've never seen issues
1101        * in the wild, it's best to play it safe and initialize the MCS.  We
1102        * could use a fast-clear for MCS because we only ever touch from render
1103        * and texture (no image load store). However, due to WA 14013111325,
1104        * we choose to ambiguate MCS as well.
1105        */
1106       if (image->vk.samples == 1) {
1107          for (uint32_t l = 0; l < level_count; l++) {
1108             const uint32_t level = base_level + l;
1109 
1110             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1111             if (base_layer >= aux_layers)
1112                break; /* We will only get fewer layers as level increases */
1113             uint32_t level_layer_count =
1114                MIN2(layer_count, aux_layers - base_layer);
1115 
1116             /* If will_full_fast_clear is set, the caller promises to
1117              * fast-clear the largest portion of the specified range as it can.
1118              * For color images, that means only the first LOD and array slice.
1119              */
1120             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1121                base_layer++;
1122                level_layer_count--;
1123                if (level_layer_count == 0)
1124                   continue;
1125             }
1126 
1127             anv_image_ccs_op(cmd_buffer, image,
1128                              image->planes[plane].primary_surface.isl.format,
1129                              ISL_SWIZZLE_IDENTITY,
1130                              aspect, level, base_layer, level_layer_count,
1131                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1132 
1133             set_image_compressed_bit(cmd_buffer, image, aspect, level,
1134                                      base_layer, level_layer_count, false);
1135          }
1136       } else {
1137          /* If will_full_fast_clear is set, the caller promises to fast-clear
1138           * the largest portion of the specified range as it can.
1139           */
1140          if (will_full_fast_clear)
1141             return;
1148 
1149          assert(base_level == 0 && level_count == 1);
1150          anv_image_mcs_op(cmd_buffer, image,
1151                           image->planes[plane].primary_surface.isl.format,
1152                           ISL_SWIZZLE_IDENTITY,
1153                           aspect, base_layer, layer_count,
1154                           ISL_AUX_OP_AMBIGUATE, NULL, false);
1155       }
1156       return;
1157    }
1158 
1159    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1160     * We can handle transitions between CCS_D/E to and from NONE.  What we
1161     * don't yet handle is switching between CCS_E and CCS_D within a given
1162     * image.  Doing so in a performant way requires more detailed aux state
1163     * tracking such as what is done in i965.  For now, just assume that we
1164     * only have one type of compression.
1165     */
1166    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1167           final_aux_usage == ISL_AUX_USAGE_NONE ||
1168           initial_aux_usage == final_aux_usage);
1169 
1170    /* If initial aux usage is NONE, there is nothing to resolve */
1171    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1172       return;
1173 
1174    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1175 
1176    /* If the initial layout supports more fast clear than the final layout
1177     * then we need at least a partial resolve.
1178     */
1179    if (final_fast_clear < initial_fast_clear) {
1180       /* Partial resolves will actually only occur on layer 0/level 0. This
1181        * is generally okay because anv only allows explicit fast clears to
1182        * the first subresource.
1183        *
1184        * The situation is a bit different with FCV_CCS_E. With that aux
1185        * usage, implicit fast clears can occur on any layer and level.
1186        * anv doesn't track fast clear states for more than the first
1187        * subresource, so we need to assert that a layout transition doesn't
1188        * attempt to partial resolve the other subresources.
1189        *
1190        * At the moment, we don't enter such a situation, and partial resolves
1191        * for higher level/layer resources shouldn't be a concern.
1192        */
1193       if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1194          assert(base_level == 0 && level_count == 1 &&
1195                 base_layer == 0 && layer_count == 1);
1196       }
1197       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1198    }
1199 
1200    if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
1201        !isl_aux_usage_has_ccs_e(final_aux_usage))
1202       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1203 
1204    if (resolve_op == ISL_AUX_OP_NONE)
1205       return;
1206 
1207    /* Perform a resolve to synchronize data between the main and aux buffer.
1208     * Before we begin, we must satisfy the cache flushing requirement specified
1209     * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1210     *
1211     *    Any transition from any value in {Clear, Render, Resolve} to a
1212     *    different value in {Clear, Render, Resolve} requires end of pipe
1213     *    synchronization.
1214     *
1215     * We perform a flush of the write cache before and after the clear and
1216     * resolve operations to meet this requirement.
1217     *
1218     * Unlike other drawing, fast clear operations are not properly
1219     * synchronized. The first PIPE_CONTROL here likely ensures that the
1220     * contents of the previous render or clear hit the render target before we
1221     * resolve and the second likely ensures that the resolve is complete before
1222     * we do any more rendering or clearing.
1223     */
1224    anv_add_pending_pipe_bits(cmd_buffer,
1225                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1226                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1227                              "before transition RT");
1228 
1229    for (uint32_t l = 0; l < level_count; l++) {
1230       uint32_t level = base_level + l;
1231 
1232       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1233       if (base_layer >= aux_layers)
1234          break; /* We will only get fewer layers as level increases */
1235       uint32_t level_layer_count =
1236          MIN2(layer_count, aux_layers - base_layer);
1237 
1238       for (uint32_t a = 0; a < level_layer_count; a++) {
1239          uint32_t array_layer = base_layer + a;
1240 
1241          /* If will_full_fast_clear is set, the caller promises to fast-clear
1242           * the largest portion of the specified range that it can.  For color
1243           * images, that means only the first LOD and array slice.
1244           */
1245          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1246             continue;
1247 
1248          if (image->vk.samples == 1) {
1249             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1250                                            image->planes[plane].primary_surface.isl.format,
1251                                            ISL_SWIZZLE_IDENTITY,
1252                                            aspect, level, array_layer, resolve_op,
1253                                            final_fast_clear);
1254          } else {
1255             /* We only support fast-clear on the first layer so partial
1256              * resolves should not be used on other layers as they will use
1257              * the clear color stored in memory that is only valid for layer0.
1258              */
1259             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1260                 array_layer != 0)
1261                continue;
1262 
1263             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1264                                            image->planes[plane].primary_surface.isl.format,
1265                                            ISL_SWIZZLE_IDENTITY,
1266                                            aspect, array_layer, resolve_op,
1267                                            final_fast_clear);
1268          }
1269       }
1270    }
1271 
1272    anv_add_pending_pipe_bits(cmd_buffer,
1273                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1274                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1275                              "after transition RT");
1276 }
1277 
1278 static MUST_CHECK VkResult
1279 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1280                                 uint32_t color_att_count)
1281 {
1282    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1283 
1284    /* Reserve one for the NULL state. */
1285    unsigned num_states = 1 + color_att_count;
1286    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1287    const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
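   /* The single allocation below is carved up at ss_stride intervals: the
    * NULL surface state comes first, followed by one surface state per
    * color attachment (see the loop further down).
    */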
1288    gfx->att_states =
1289       anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
1290    if (gfx->att_states.map == NULL)
1291       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1292 
1293    struct anv_state next_state = gfx->att_states;
1294    next_state.alloc_size = isl_dev->ss.size;
1295 
1296    gfx->null_surface_state = next_state;
1297    next_state.offset += ss_stride;
1298    next_state.map += ss_stride;
1299 
1300    gfx->color_att_count = color_att_count;
1301    for (uint32_t i = 0; i < color_att_count; i++) {
1302       gfx->color_att[i] = (struct anv_attachment) {
1303          .surface_state.state = next_state,
1304       };
1305       next_state.offset += ss_stride;
1306       next_state.map += ss_stride;
1307    }
1308    gfx->depth_att = (struct anv_attachment) { };
1309    gfx->stencil_att = (struct anv_attachment) { };
1310 
1311    return VK_SUCCESS;
1312 }
1313 
1314 static void
1315 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1316 {
1317    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1318 
1319    gfx->render_area = (VkRect2D) { };
1320    gfx->layer_count = 0;
1321    gfx->samples = 0;
1322 
1323    gfx->color_att_count = 0;
1324    gfx->depth_att = (struct anv_attachment) { };
1325    gfx->stencil_att = (struct anv_attachment) { };
1326    gfx->null_surface_state = ANV_STATE_NULL;
1327 }
1328 
1329 /**
1330  * Program the hardware to use the specified L3 configuration.
1331  */
1332 void
1333 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1334                            const struct intel_l3_config *cfg)
1335 {
1336    assert(cfg || GFX_VER >= 12);
1337    if (cfg == cmd_buffer->state.current_l3_config)
1338       return;
1339 
1340 #if GFX_VER >= 11
1341    /* On Gfx11+ we use only one config, so verify it remains the same and skip
1342     * the stalling programming entirely.
1343     */
1344    assert(cfg == cmd_buffer->device->l3_config);
1345 #else
1346    if (INTEL_DEBUG(DEBUG_L3)) {
1347       mesa_logd("L3 config transition: ");
1348       intel_dump_l3_config(cfg, stderr);
1349    }
1350 
1351    /* According to the hardware docs, the L3 partitioning can only be changed
1352     * while the pipeline is completely drained and the caches are flushed,
1353     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1354     */
1355    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1356                                 cmd_buffer->state.current_pipeline,
1357                                 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1358                                 ANV_PIPE_CS_STALL_BIT);
1359 
1360    /* ...followed by a second pipelined PIPE_CONTROL that initiates
1361     * invalidation of the relevant caches.  Note that because RO invalidation
1362     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1363     * command is processed by the CS) we cannot combine it with the previous
1364     * stalling flush as the hardware documentation suggests, because that
1365     * would cause the CS to stall on previous rendering *after* RO
1366     * invalidation and wouldn't prevent the RO caches from being polluted by
1367     * concurrent rendering before the stall completes.  This intentionally
1368     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1369     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1370     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1371     * already guarantee that there is no concurrent GPGPU kernel execution
1372     * (see SKL HSD 2132585).
1373     */
1374    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1375                                 cmd_buffer->state.current_pipeline,
1376                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
1377                                 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
1378                                 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
1379                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
1380 
1381    /* Now send a third stalling flush to make sure that invalidation is
1382     * complete when the L3 configuration registers are modified.
1383     */
1384    genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1385                                 cmd_buffer->state.current_pipeline,
1386                                 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1387                                 ANV_PIPE_CS_STALL_BIT);
1388 
1389    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1390 #endif /* GFX_VER >= 11 */
1391    cmd_buffer->state.current_l3_config = cfg;
1392 }
1393 
1394 ALWAYS_INLINE void
1395 genX(invalidate_aux_map)(struct anv_batch *batch,
1396                          struct anv_device *device,
1397                          enum intel_engine_class engine_class,
1398                          enum anv_pipe_bits bits)
1399 {
1400 #if GFX_VER == 12
1401    if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
1402       uint32_t register_addr = 0;
1403       switch (engine_class) {
1404       case INTEL_ENGINE_CLASS_COMPUTE:
1405          register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
1406          break;
1407       case INTEL_ENGINE_CLASS_COPY:
1408 #if GFX_VERx10 >= 125
1409          register_addr = GENX(BCS_CCS_AUX_INV_num);
1410 #endif
1411          break;
1412       case INTEL_ENGINE_CLASS_VIDEO:
1413          register_addr = GENX(VD0_CCS_AUX_INV_num);
1414          break;
1415       case INTEL_ENGINE_CLASS_RENDER:
1416       default:
1417          register_addr = GENX(GFX_CCS_AUX_INV_num);
1418          break;
1419       }
1420 
1421       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
1422          lri.RegisterOffset = register_addr;
1423          lri.DataDWord = 1;
1424       }
1425 
1426       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
1427       if (intel_needs_workaround(device->info, 16018063123) &&
1428           engine_class == INTEL_ENGINE_CLASS_COPY) {
1429          genX(batch_emit_fast_color_dummy_blit)(batch, device);
1430       }
1431 
1432       /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1433        *
1434        *    "Poll Aux Invalidation bit once the invalidation is set
1435        *     (Register 4208 bit 0)"
1436        */
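      /* With COMPARE_SAD_EQUAL_SDD and a semaphore data value of 0, this
       * waits until the polled register reads back 0, i.e. until the
       * invalidation bit set by the LRI above has been cleared by HW.
       */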
1437       anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
1438          sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
1439          sem.WaitMode = PollingMode;
1440          sem.RegisterPollMode = true;
1441          sem.SemaphoreDataDword = 0x0;
1442          sem.SemaphoreAddress =
1443             anv_address_from_u64(register_addr);
1444       }
1445    }
1446 #else
1447    assert(!device->info->has_aux_map);
1448 #endif
1449 }
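
/* Note: within this file, genX(invalidate_aux_map) is reached either via
 * genX(emit_apply_pipe_flushes) for the render/compute pipelines, or
 * directly from genX(cmd_buffer_apply_pipe_flushes) on the blitter/video
 * queues, which cannot emit PIPE_CONTROL.
 */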
1450 
1451 ALWAYS_INLINE enum anv_pipe_bits
1452 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1453                               struct anv_device *device,
1454                               uint32_t current_pipeline,
1455                               enum anv_pipe_bits bits,
1456                               enum anv_pipe_bits *emitted_flush_bits)
1457 {
1458 #if GFX_VER >= 12
1459    /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
1460     *
1461     *     "SW must follow below programming restrictions when programming
1462     *      PIPE_CONTROL command [for ComputeCS]:
1463     *      ...
1464     *      Following bits must not be set when programmed for ComputeCS:
1465     *      - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
1466     *         and "Tile Cache Flush Enable"
1467     *      - "Depth Stall Enable", Stall at Pixel Scoreboard and
1468     *         "PSD Sync Enable".
1469     *      - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
1470     *         "AMFS Flush Enable", "VF Cache Invalidation Enable" and
1471     *         "Global Snapshot Count Reset"."
1472     *
1473     * XXX: According to spec this should not be a concern for a regular
1474     * RCS in GPGPU mode, but during testing it was found that at least
1475     * the "VF Cache Invalidation Enable" bit is ignored in such a case.
1476     * This can cause us to miss some important invalidations
1477     * (e.g. from CmdPipelineBarriers) and have incoherent data.
1478     *
1479     * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
1480     * when specific 3d related bits are programmed in pipecontrol in
1481     * compute mode" that suggests not using "RT Cache Flush" in GPGPU mode.
1482     *
1483     * The other bits are not confirmed to cause problems, but are included
1484     * here just to be safe, as they're also not really relevant in GPGPU
1485     * mode, and having them doesn't seem to cause any regressions.
1486     *
1487     * So if we're currently in GPGPU mode, we hide some bits from this
1488     * flush and only flush them once we're able to.  The same applies to
1489     * GPGPU-only bits when we're not in GPGPU mode.
1490     */
1491    enum anv_pipe_bits defer_bits = bits &
1492       (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS: ANV_PIPE_GPGPU_BITS);
1493 
1494    bits &= ~defer_bits;
1495 #endif
1496 
1497    /*
1498     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1499     *
1500     *    Write synchronization is a special case of end-of-pipe
1501     *    synchronization that requires that the render cache and/or depth
1502     *    related caches are flushed to memory, where the data will become
1503     *    globally visible. This type of synchronization is required prior to
1504     *    SW (CPU) actually reading the result data from memory, or initiating
1505     *    an operation that will use as a read surface (such as a texture
1506     *    surface) a previous render target and/or depth/stencil buffer
1507     *
1508     *
1509     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1510     *
1511     *    Exercising the write cache flush bits (Render Target Cache Flush
1512     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1513     *    ensures the write caches are flushed and doesn't guarantee the data
1514     *    is globally visible.
1515     *
1516     *    SW can track the completion of the end-of-pipe-synchronization by
1517     *    using "Notify Enable" and "PostSync Operation - Write Immediate
1518     *    Data" in the PIPE_CONTROL command.
1519     *
1520     * In other words, flushes are pipelined while invalidations are handled
1521     * immediately.  Therefore, if we're flushing anything then we need to
1522     * schedule an end-of-pipe sync before any invalidations can happen.
1523     */
1524    if (bits & ANV_PIPE_FLUSH_BITS)
1525       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1526 
1527 
1528    /* HSD 1209978178: docs say that before programming the aux table:
1529     *
1530     *    "Driver must ensure that the engine is IDLE but ensure it doesn't
1531     *    add extra flushes in the case it knows that the engine is already
1532     *    IDLE."
1533     *
1534     * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1535     *
1536     *    "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
1537     *
1538     * Notice we don't set the L3 Fabric Flush here, because we have
1539     * ANV_PIPE_END_OF_PIPE_SYNC_BIT which inserts a CS stall. The
1540     * PIPE_CONTROL::L3 Fabric Flush documentation says :
1541     *
1542     *    "L3 Fabric Flush will ensure all the pending transactions in the L3
1543     *     Fabric are flushed to global observation point. HW does implicit L3
1544     *     Fabric Flush on all stalling flushes (both explicit and implicit)
1545     *     and on PIPECONTROL having Post Sync Operation enabled."
1546     *
1547     * Therefore setting L3 Fabric Flush here would be redundant.
1548     */
1549    if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
1550       if (current_pipeline == GPGPU) {
1551          bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1552                   ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1553                   (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1554       } else if (current_pipeline == _3D) {
1555          bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1556                   ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1557                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
1558                   (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1559       }
1560    }
1561 
1562    /* If we're going to do an invalidate and we have a pending end-of-pipe
1563     * sync that has yet to be resolved, we do the end-of-pipe sync now.
1564     */
1565    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1566        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1567       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1568       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1569 
1570       if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
1571          fputs("pc: add ", stderr);
1572          anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stderr);
1573          fprintf(stderr, "reason: Ensure flushes done before invalidate\n");
1574       }
1575    }
1576 
1577    /* Project: SKL / Argument: LRI Post Sync Operation [23]
1578     *
1579     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1580     *  programmed prior to programming a PIPECONTROL command with "LRI
1581     *  Post Sync Operation" in GPGPU mode of operation (i.e when
1582     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
1583     *
1584     * The same text exists a few rows below for Post Sync Op.
1585     */
1586    if (bits & ANV_PIPE_POST_SYNC_BIT) {
1587       if (GFX_VER == 9 && current_pipeline == GPGPU)
1588          bits |= ANV_PIPE_CS_STALL_BIT;
1589       bits &= ~ANV_PIPE_POST_SYNC_BIT;
1590    }
1591 
1592    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1593                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1594       enum anv_pipe_bits flush_bits =
1595          bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1596                  ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1597 
1598 #if GFX_VERx10 >= 125
1599       if (current_pipeline != GPGPU) {
1600          if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
1601             flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
1602       } else {
1603          if (flush_bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
1604                            ANV_PIPE_DATA_CACHE_FLUSH_BIT))
1605             flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
1606       }
1607 
1608       /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
1609        *
1610        *    "'HDC Pipeline Flush' bit must be set for this bit to take
1611        *     effect."
1612        */
1613       if (flush_bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
1614          flush_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
1615 #endif
1616 
1617 #if GFX_VER < 12
1618       if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
1619          flush_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
1620 #endif
1621 
1622       uint32_t sync_op = NoWrite;
1623       struct anv_address addr = ANV_NULL_ADDRESS;
1624 
1625       /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1626        *
1627        *    "The most common action to perform upon reaching a
1628        *    synchronization point is to write a value out to memory. An
1629        *    immediate value (included with the synchronization command) may
1630        *    be written."
1631        *
1632        *
1633        * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1634        *
1635        *    "In case the data flushed out by the render engine is to be
1636        *    read back in to the render engine in coherent manner, then the
1637        *    render engine has to wait for the fence completion before
1638        *    accessing the flushed data. This can be achieved by following
1639        *    means on various products: PIPE_CONTROL command with CS Stall
1640        *    and the required write caches flushed with Post-Sync-Operation
1641        *    as Write Immediate Data.
1642        *
1643        *    Example:
1644        *       - Workload-1 (3D/GPGPU/MEDIA)
1645        *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1646        *         Immediate Data, Required Write Cache Flush bits set)
1647        *       - Workload-2 (Can use the data produce or output by
1648        *         Workload-1)
1649        */
1650       if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1651          flush_bits |= ANV_PIPE_CS_STALL_BIT;
1652          sync_op = WriteImmediateData;
1653          addr = device->workaround_address;
1654       }
1655 
1656       /* Flush PC. */
1657       genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1658                                          sync_op, addr, 0, flush_bits);
1659 
1660       /* If the caller wants to know what flushes have been emitted,
1661        * provide the bits based off the PIPE_CONTROL programmed bits.
1662        */
1663       if (emitted_flush_bits != NULL)
1664          *emitted_flush_bits = flush_bits;
1665 
1666       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1667                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1668    }
1669 
1670    if (bits & ANV_PIPE_INVALIDATE_BITS) {
1671       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1672        *
1673        *    "If the VF Cache Invalidation Enable is set to a 1 in a
1674        *    PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
1675        *    0, with the VF Cache Invalidation Enable set to 0 needs to be sent
1676        *    prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
1677        *    a 1."
1678        *
1679        * This appears to hang Broadwell, so we restrict it to just gfx9.
1680        */
1681       if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
1682          anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
1683 
1684 #if GFX_VER >= 9 && GFX_VER <= 11
1685       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1686        *
1687        *    "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
1688        *     always set for GPGPU workloads when “Texture Cache
1689        *     Invalidation Enable” bit is set".
1690        *
1691        * Workaround stopped appearing in TGL PRMs.
1692        */
1693       if (current_pipeline == GPGPU &&
1694           (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
1695          bits |= ANV_PIPE_CS_STALL_BIT;
1696 #endif
1697 
1698       uint32_t sync_op = NoWrite;
1699       struct anv_address addr = ANV_NULL_ADDRESS;
1700 
1701       /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1702        *
1703        *    "When VF Cache Invalidate is set “Post Sync Operation” must be
1704        *    enabled to “Write Immediate Data” or “Write PS Depth Count” or
1705        *    “Write Timestamp”.
1706        */
1707       if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1708          sync_op = WriteImmediateData;
1709          addr = device->workaround_address;
1710       }
1711 
1712       /* Invalidate PC. */
1713       genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1714                                          sync_op, addr, 0, bits);
1715 
1716       enum intel_engine_class engine_class =
1717          current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
1718                                      INTEL_ENGINE_CLASS_RENDER;
1719       genX(invalidate_aux_map)(batch, device, engine_class, bits);
1720 
1721       bits &= ~ANV_PIPE_INVALIDATE_BITS;
1722    }
1723 
1724 #if GFX_VER >= 12
1725    bits |= defer_bits;
1726 #endif
1727 
1728    return bits;
1729 }
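
/* Illustrative usage sketch (not a specific call site): callers typically
 * accumulate bits with anv_add_pending_pipe_bits() and then have
 * genX(cmd_buffer_apply_pipe_flushes)() below funnel them through the
 * helper above, e.g.
 *
 *    anv_add_pending_pipe_bits(cmd_buffer,
 *                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
 *                              "example reason");
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 */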
1730 
1731 ALWAYS_INLINE void
1732 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1733 {
1734 #if INTEL_NEEDS_WA_1508744258
1735    /* If we're changing the state of the RHWO optimization, we need to have
1736     * sb_stall+cs_stall.
1737     */
1738    const bool rhwo_opt_change =
1739       cmd_buffer->state.rhwo_optimization_enabled !=
1740       cmd_buffer->state.pending_rhwo_optimization_enabled;
1741    if (rhwo_opt_change) {
1742       anv_add_pending_pipe_bits(cmd_buffer,
1743                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
1744                                 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1745                                 "change RHWO optimization");
1746    }
1747 #endif
1748 
1749    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1750 
1751    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1752       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1753    else if (bits == 0)
1754       return;
1755 
1756    if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
1757        anv_cmd_buffer_is_video_queue(cmd_buffer)) {
1758       if (bits & ANV_PIPE_INVALIDATE_BITS) {
1759          genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
1760                                   cmd_buffer->queue_family->engine_class, bits);
1761          bits &= ~ANV_PIPE_INVALIDATE_BITS;
1762       }
1763       cmd_buffer->state.pending_pipe_bits = bits;
1764       return;
1765    }
1766 
1767    const bool trace_flush =
1768       (bits & (ANV_PIPE_FLUSH_BITS |
1769                ANV_PIPE_STALL_BITS |
1770                ANV_PIPE_INVALIDATE_BITS |
1771                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
1772    if (trace_flush)
1773       trace_intel_begin_stall(&cmd_buffer->trace);
1774 
1775    if (GFX_VER == 9 &&
1776        (bits & ANV_PIPE_CS_STALL_BIT) &&
1777        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1778       /* If we are doing a VF cache invalidate AND a CS stall (it must be
1779        * both) then we can reset our vertex cache tracking.
1780        */
1781       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1782              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1783       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1784              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1785    }
1786 
1787 
1788    enum anv_pipe_bits emitted_bits = 0;
1789    cmd_buffer->state.pending_pipe_bits =
1790       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1791                                     cmd_buffer->device,
1792                                     cmd_buffer->state.current_pipeline,
1793                                     bits,
1794                                     &emitted_bits);
1795    anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
1796 
1797 #if INTEL_NEEDS_WA_1508744258
1798    if (rhwo_opt_change) {
1799       anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1800          c1.RCCRHWOOptimizationDisable =
1801             !cmd_buffer->state.pending_rhwo_optimization_enabled;
1802          c1.RCCRHWOOptimizationDisableMask = true;
1803       }
1804       cmd_buffer->state.rhwo_optimization_enabled =
1805          cmd_buffer->state.pending_rhwo_optimization_enabled;
1806    }
1807 #endif
1808 
1809    if (trace_flush) {
1810       trace_intel_end_stall(&cmd_buffer->trace,
1811                             bits & ~cmd_buffer->state.pending_pipe_bits,
1812                             anv_pipe_flush_bit_to_ds_stall_flag, NULL);
1813    }
1814 }
1815 
1816 static inline struct anv_state
1817 emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1818                                         struct anv_cmd_pipeline_state *pipe_state,
1819                                         struct anv_pipeline_binding *binding,
1820                                         const struct anv_descriptor *desc)
1821 {
1822    if (!desc->buffer)
1823       return anv_null_surface_state_for_binding_table(cmd_buffer->device);
1824 
1825    /* Compute the offset within the buffer */
1826    uint32_t dynamic_offset =
1827       pipe_state->dynamic_offsets[
1828          binding->set].offsets[binding->dynamic_offset_index];
1829    uint64_t offset = desc->offset + dynamic_offset;
1830    /* Clamp to the buffer size */
1831    offset = MIN2(offset, desc->buffer->vk.size);
1832    /* Clamp the range to the buffer size */
1833    uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
1834 
1835    /* Align the range for consistency */
1836    if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
1837       range = align(range, ANV_UBO_ALIGNMENT);
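   /* Worked example with hypothetical values: desc->offset = 256 and
    * dynamic_offset = 4096 on an 8192-byte buffer give offset = 4352;
    * with desc->range = 65536, range is clamped to 8192 - 4352 = 3840 and,
    * for dynamic UBOs, then rounded up to a multiple of ANV_UBO_ALIGNMENT.
    */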
1838 
1839    struct anv_address address =
1840       anv_address_add(desc->buffer->address, offset);
1841 
1842    struct anv_state surface_state =
1843       anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
1844    if (surface_state.map == NULL)
1845       return ANV_STATE_NULL;
1846 
1847    enum isl_format format =
1848       anv_isl_format_for_descriptor_type(cmd_buffer->device,
1849                                          desc->type);
1850 
1851    isl_surf_usage_flags_t usage =
1852       anv_isl_usage_for_descriptor_type(desc->type);
1853 
1854    anv_fill_buffer_surface_state(cmd_buffer->device,
1855                                  surface_state.map,
1856                                  format, ISL_SWIZZLE_IDENTITY,
1857                                  usage, address, range, 1);
1858 
1859    return surface_state;
1860 }
1861 
1862 static uint32_t
1863 emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1864                                              struct anv_cmd_pipeline_state *pipe_state,
1865                                              struct anv_pipeline_binding *binding,
1866                                              const struct anv_descriptor *desc)
1867 {
1868    struct anv_device *device = cmd_buffer->device;
1869    struct anv_state surface_state;
1870 
1871    /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1872     * Depending on where the descriptor surface state is allocated, they can
1873     * either come from device->internal_surface_state_pool or
1874     * device->bindless_surface_state_pool.
1875     */
1876    switch (desc->type) {
1877    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1878    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1879    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
1880       if (desc->image_view) {
1881          const struct anv_surface_state *sstate =
1882             anv_image_view_texture_surface_state(desc->image_view,
1883                                                  binding->plane,
1884                                                  desc->layout);
1885          surface_state = desc->image_view->use_surface_state_stream ?
1886             sstate->state :
1887             anv_bindless_state_for_binding_table(device, sstate->state);
1888          assert(surface_state.alloc_size);
1889       } else {
1890          surface_state = anv_null_surface_state_for_binding_table(device);
1891       }
1892       break;
1893    }
1894 
1895    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1896       if (desc->image_view) {
1897          const struct anv_surface_state *sstate =
1898             anv_image_view_storage_surface_state(desc->image_view);
1899          surface_state = desc->image_view->use_surface_state_stream ?
1900             sstate->state :
1901             anv_bindless_state_for_binding_table(device, sstate->state);
1902          assert(surface_state.alloc_size);
1903       } else {
1904          surface_state =
1905             anv_null_surface_state_for_binding_table(device);
1906       }
1907       break;
1908    }
1909 
1910    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1911    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1912       if (desc->set_buffer_view) {
1913          surface_state = desc->set_buffer_view->general.state;
1914          assert(surface_state.alloc_size);
1915       } else {
1916          surface_state = anv_null_surface_state_for_binding_table(device);
1917       }
1918       break;
1919 
1920    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1921       if (desc->buffer_view) {
1922          surface_state = anv_bindless_state_for_binding_table(
1923             device,
1924             desc->buffer_view->general.state);
1925          assert(surface_state.alloc_size);
1926       } else {
1927          surface_state = anv_null_surface_state_for_binding_table(device);
1928       }
1929       break;
1930 
1931    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
1932    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
1933       surface_state =
1934          emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
1935                                                  binding, desc);
1936       break;
1937 
1938    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
1939       if (desc->buffer_view) {
1940          surface_state = anv_bindless_state_for_binding_table(
1941             device, desc->buffer_view->storage.state);
1942          assert(surface_state.alloc_size);
1943       } else {
1944          surface_state = anv_null_surface_state_for_binding_table(device);
1945       }
1946       break;
1947 
1948    default:
1949       unreachable("Invalid descriptor type");
1950    }
1951 
1952    return surface_state.offset;
1953 }
1954 
1955 static uint32_t
1956 emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1957                                            struct anv_cmd_pipeline_state *pipe_state,
1958                                            const struct anv_descriptor_set *set,
1959                                            struct anv_pipeline_binding *binding,
1960                                            const struct anv_descriptor *desc)
1961 {
1962    uint32_t desc_offset;
1963 
1964    /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1965     * Depending on where the descriptor surface state is allocated, they can
1966     * either come from device->internal_surface_state_pool or
1967     * device->bindless_surface_state_pool.
1968     */
1969    switch (desc->type) {
1970    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1971    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1972    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
1973    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
1974    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1975    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1976    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1977    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
1978       desc_offset = set->desc_offset + binding->set_offset;
1979       break;
1980 
1981    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
1982    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
1983       struct anv_state state =
1984          emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
1985                                                  binding, desc);
1986       desc_offset = state.offset;
1987       break;
1988    }
1989 
1990    default:
1991       unreachable("Invalid descriptor type");
1992    }
1993 
1994    return desc_offset;
1995 }
1996 
1997 static VkResult
1998 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
1999                    struct anv_cmd_pipeline_state *pipe_state,
2000                    struct anv_shader_bin *shader,
2001                    struct anv_state *bt_state)
2002 {
2003    uint32_t state_offset;
2004 
2005    struct anv_pipeline_bind_map *map = &shader->bind_map;
2006    if (map->surface_count == 0) {
2007       *bt_state = (struct anv_state) { 0, };
2008       return VK_SUCCESS;
2009    }
2010 
2011    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2012                                                   map->surface_count,
2013                                                   &state_offset);
2014    uint32_t *bt_map = bt_state->map;
2015 
2016    if (bt_state->map == NULL)
2017       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2018 
2019    for (uint32_t s = 0; s < map->surface_count; s++) {
2020       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2021 
2022       struct anv_state surface_state;
2023 
2024       switch (binding->set) {
2025       case ANV_DESCRIPTOR_SET_NULL:
2026          bt_map[s] = 0;
2027          break;
2028 
2029       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2030          /* Color attachment binding */
2031          assert(shader->stage == MESA_SHADER_FRAGMENT);
2032          if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2033             const struct anv_attachment *att =
2034                &cmd_buffer->state.gfx.color_att[binding->index];
2035             surface_state = att->surface_state.state;
2036          } else {
2037             surface_state = cmd_buffer->state.gfx.null_surface_state;
2038          }
2039          assert(surface_state.map);
2040          bt_map[s] = surface_state.offset + state_offset;
2041          break;
2042 
2043       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2044          /* This is always the first binding for compute shaders */
2045          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2046 
2047          struct anv_state surface_state =
2048             anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2049          if (surface_state.map == NULL)
2050             return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2051 
2052          const enum isl_format format =
2053             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2054                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
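         /* num_workgroups points at the 3 dispatch dimensions stored as
          * uint32_t, hence the 12-byte range passed below.
          */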
2055          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
2056                                        format, ISL_SWIZZLE_IDENTITY,
2057                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2058                                        cmd_buffer->state.compute.num_workgroups,
2059                                        12, 1);
2060 
2061          assert(surface_state.map);
2062          bt_map[s] = surface_state.offset + state_offset;
2063          break;
2064       }
2065 
2066       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2067          struct anv_descriptor_set *set =
2068             pipe_state->descriptors[binding->index];
2069 
2070          /* If the shader doesn't access the set buffer, just put the null
2071           * surface.
2072           */
2073          if (set->is_push && !shader->push_desc_info.used_set_buffer) {
2074             bt_map[s] = 0;
2075             break;
2076          }
2077 
2078          /* This is a descriptor set buffer so the set index is actually
2079           * given by binding->binding.  (Yes, that's confusing.)
2080           */
2081          assert(set->desc_surface_mem.alloc_size);
2082          assert(set->desc_surface_state.alloc_size);
2083          bt_map[s] = set->desc_surface_state.offset + state_offset;
2084          add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
2085          break;
2086       }
2087 
2088       default: {
2089          assert(binding->set < MAX_SETS);
2090          const struct anv_descriptor_set *set =
2091             pipe_state->descriptors[binding->set];
2092 
2093          if (binding->index >= set->descriptor_count) {
2094             /* From the Vulkan spec section entitled "DescriptorSet and
2095              * Binding Assignment":
2096              *
2097              *    "If the array is runtime-sized, then array elements greater
2098              *    than or equal to the size of that binding in the bound
2099              *    descriptor set must not be used."
2100              *
2101              * Unfortunately, the compiler isn't smart enough to figure out
2102              * when a dynamic binding isn't used so it may grab the whole
2103              * array and stick it in the binding table.  In this case, it's
2104              * safe to just skip those bindings that are OOB.
2105              */
2106             assert(binding->index < set->layout->descriptor_count);
2107             continue;
2108          }
2109 
2110          /* For push descriptors, if the binding is fully promoted to push
2111           * constants, just reference the null surface in the binding table.
2112           * It's unused and we didn't allocate/pack a surface state for it.
2113           */
2114          if (set->is_push) {
2115             uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
2116             assert(desc_idx < MAX_PUSH_DESCRIPTORS);
2117 
2118             if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
2119                surface_state =
2120                   anv_null_surface_state_for_binding_table(cmd_buffer->device);
2121                break;
2122             }
2123          }
2124 
2125          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2126          if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
2127              desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
2128             /* Nothing for us to do here */
2129             continue;
2130          }
2131 
2132          const struct anv_pipeline *pipeline = pipe_state->pipeline;
2133          uint32_t surface_state_offset;
2134          if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
2135             surface_state_offset =
2136                emit_indirect_descriptor_binding_table_entry(cmd_buffer,
2137                                                             pipe_state,
2138                                                             binding, desc);
2139          } else {
2140             surface_state_offset =
2141                emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
2142                                                           set, binding, desc);
2143          }
2144 
2145          bt_map[s] = surface_state_offset + state_offset;
2146          break;
2147       }
2148       }
2149    }
2150 
2151    return VK_SUCCESS;
2152 }
2153 
2154 static VkResult
2155 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2156               struct anv_cmd_pipeline_state *pipe_state,
2157               struct anv_shader_bin *shader,
2158               struct anv_state *state)
2159 {
2160    struct anv_pipeline_bind_map *map = &shader->bind_map;
2161    if (map->sampler_count == 0) {
2162       *state = (struct anv_state) { 0, };
2163       return VK_SUCCESS;
2164    }
2165 
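   /* Each SAMPLER_STATE entry is 4 dwords (16 bytes); the table is
    * allocated from dynamic state with the 32-byte alignment sampler
    * state requires.
    */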
2166    uint32_t size = map->sampler_count * 16;
2167    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2168 
2169    if (state->map == NULL)
2170       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2171 
2172    for (uint32_t s = 0; s < map->sampler_count; s++) {
2173       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2174       const struct anv_descriptor *desc =
2175          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2176 
2177       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2178           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2179          continue;
2180 
2181       struct anv_sampler *sampler = desc->sampler;
2182 
2183       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2184        * happens to be zero.
2185        */
2186       if (sampler == NULL)
2187          continue;
2188 
2189       memcpy(state->map + (s * 16),
2190              sampler->state[binding->plane], sizeof(sampler->state[0]));
2191    }
2192 
2193    return VK_SUCCESS;
2194 }
2195 
2196 uint32_t
2197 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
2198                                        struct anv_cmd_pipeline_state *pipe_state,
2199                                        const VkShaderStageFlags dirty,
2200                                        struct anv_shader_bin **shaders,
2201                                        uint32_t num_shaders)
2202 {
2203    VkShaderStageFlags flushed = 0;
2204 
2205    VkResult result = VK_SUCCESS;
2206    for (uint32_t i = 0; i < num_shaders; i++) {
2207       if (!shaders[i])
2208          continue;
2209 
2210       gl_shader_stage stage = shaders[i]->stage;
2211       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2212       if ((vk_stage & dirty) == 0)
2213          continue;
2214 
2215       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2216       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2217                              &cmd_buffer->state.samplers[stage]);
2218       if (result != VK_SUCCESS)
2219          break;
2220 
2221       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2222       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2223                                   &cmd_buffer->state.binding_tables[stage]);
2224       if (result != VK_SUCCESS)
2225          break;
2226 
2227       flushed |= vk_stage;
2228    }
2229 
2230    if (result != VK_SUCCESS) {
2231       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2232 
2233       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2234       if (result != VK_SUCCESS)
2235          return 0;
2236 
2237       /* Re-emit state base addresses so we get the new surface state base
2238        * address before we start emitting binding tables etc.
2239        */
2240       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2241 
2242       /* Re-emit all active binding tables */
2243       flushed = 0;
2244 
2245       for (uint32_t i = 0; i < num_shaders; i++) {
2246          if (!shaders[i])
2247             continue;
2248 
2249          gl_shader_stage stage = shaders[i]->stage;
2250 
2251          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2252                                 &cmd_buffer->state.samplers[stage]);
2253          if (result != VK_SUCCESS) {
2254             anv_batch_set_error(&cmd_buffer->batch, result);
2255             return 0;
2256          }
2257          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2258                                      &cmd_buffer->state.binding_tables[stage]);
2259          if (result != VK_SUCCESS) {
2260             anv_batch_set_error(&cmd_buffer->batch, result);
2261             return 0;
2262          }
2263 
2264          flushed |= mesa_to_vk_shader_stage(stage);
2265       }
2266    }
2267 
2268    return flushed;
2269 }
2270 
2271 /* This function generates the surface state used to read the content of the
2272  * descriptor buffer.
2273  */
2274 void
2275 genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
2276                                                      struct anv_descriptor_set *set)
2277 {
2278    assert(set->desc_surface_state.map == NULL);
2279 
2280    struct anv_descriptor_set_layout *layout = set->layout;
2281    enum isl_format format =
2282       anv_isl_format_for_descriptor_type(cmd_buffer->device,
2283                                          VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2284 
2285    set->desc_surface_state =
2286       anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2287    if (set->desc_surface_state.map == NULL)
2288       return;
2289    anv_fill_buffer_surface_state(cmd_buffer->device,
2290                                  set->desc_surface_state.map,
2291                                  format, ISL_SWIZZLE_IDENTITY,
2292                                  ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2293                                  set->desc_surface_addr,
2294                                  layout->descriptor_buffer_surface_size, 1);
2295 }
2296 
2297 /* This function generates surface states used by a pipeline for push
2298  * descriptors. This is delayed to the draw/dispatch time to avoid allocation
2299  * and surface state generation when a pipeline is not going to use the
2300  * binding table to access any push descriptor data.
2301  */
2302 void
2303 genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
2304                                                struct anv_descriptor_set *set)
2305 {
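   /* generate_surface_states is a bitmask of descriptor indices;
    * u_bit_scan() pops the lowest set bit on each iteration until the mask
    * is empty.
    */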
2306    while (set->generate_surface_states) {
2307       int desc_idx = u_bit_scan(&set->generate_surface_states);
2308       struct anv_descriptor *desc = &set->descriptors[desc_idx];
2309       struct anv_buffer_view *bview = desc->set_buffer_view;
2310 
2311       if (bview != NULL && bview->general.state.map == NULL) {
2312          bview->general.state =
2313             anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2314          if (bview->general.state.map == NULL)
2315             return;
2316          anv_descriptor_write_surface_state(cmd_buffer->device, desc,
2317                                             bview->general.state);
2318       }
2319    }
2320 }
2321 
2322 ALWAYS_INLINE void
2323 genX(batch_emit_pipe_control)(struct anv_batch *batch,
2324                               const struct intel_device_info *devinfo,
2325                               uint32_t current_pipeline,
2326                               enum anv_pipe_bits bits,
2327                               const char *reason)
2328 {
2329    genX(batch_emit_pipe_control_write)(batch,
2330                                        devinfo,
2331                                        current_pipeline,
2332                                        NoWrite,
2333                                        ANV_NULL_ADDRESS,
2334                                        0,
2335                                        bits,
2336                                        reason);
2337 }
2338 
2339 ALWAYS_INLINE void
2340 genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
2341                                     const struct intel_device_info *devinfo,
2342                                     uint32_t current_pipeline,
2343                                     uint32_t post_sync_op,
2344                                     struct anv_address address,
2345                                     uint32_t imm_data,
2346                                     enum anv_pipe_bits bits,
2347                                     const char *reason)
2348 {
2349    if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
2350        (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
2351       unreachable("Trying to emit unsupported PIPE_CONTROL command.");
2352 
2353    /* XXX - insert all workarounds and GFX specific things below. */
2354 
2355    /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
2356     * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
2357     * with CS_STALL Bit set (with No POST_SYNC ENABLED)
2358     */
2359    if (intel_device_info_is_adln(devinfo) &&
2360        current_pipeline == GPGPU &&
2361        post_sync_op != NoWrite) {
2362       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2363          pipe.CommandStreamerStallEnable = true;
2364          anv_debug_dump_pc(pipe, "Wa_14014966230");
2365       };
2366    }
2367 
2368 #if INTEL_NEEDS_WA_1409600907
2369    /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2370     * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2371     */
2372    if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
2373       bits |= ANV_PIPE_DEPTH_STALL_BIT;
2374 #endif
2375 
2376    anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2377 #if GFX_VERx10 >= 125
2378       pipe.UntypedDataPortCacheFlushEnable =
2379          bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2380       pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
2381 #endif
2382 #if GFX_VER == 12
2383       pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2384 #endif
2385 #if GFX_VER > 11
2386       pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2387 #endif
2388       pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2389       pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2390       pipe.RenderTargetCacheFlushEnable =
2391          bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2392 
2393       pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2394 
2395 #if GFX_VERx10 >= 125
2396       pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2397 #endif
2398       pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2399       pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2400 
2401       pipe.StateCacheInvalidationEnable =
2402          bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2403       pipe.ConstantCacheInvalidationEnable =
2404          bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2405 #if GFX_VER >= 12
2406       /* Invalidates the L3 cache part in which index & vertex data is loaded
2407        * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2408        */
2409       pipe.L3ReadOnlyCacheInvalidationEnable =
2410          bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2411 #endif
2412       pipe.VFCacheInvalidationEnable =
2413          bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2414       pipe.TextureCacheInvalidationEnable =
2415          bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2416       pipe.InstructionCacheInvalidateEnable =
2417          bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2418 
2419       pipe.PostSyncOperation = post_sync_op;
2420       pipe.Address = address;
2421       pipe.DestinationAddressType = DAT_PPGTT;
2422       pipe.ImmediateData = imm_data;
2423 
2424       anv_debug_dump_pc(pipe, reason);
2425    }
2426 }
2427 
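/* Usage sketch (illustrative only, not part of the driver): a caller that
 * only needs a stall or cache flush without a post-sync write would
 * presumably invoke the helper above with NoWrite and a null address,
 * along these lines:
 *
 *    genX(batch_emit_pipe_control_write)(batch, devinfo, current_pipeline,
 *                                        NoWrite, ANV_NULL_ADDRESS, 0,
 *                                        ANV_PIPE_CS_STALL_BIT, "example");
 *
 * The exact "no post-sync" arguments are an assumption based on the
 * post_sync_op / address / imm_data parameters above.
 */
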
2428 /* Set preemption on/off. */
2429 void
2430 genX(batch_set_preemption)(struct anv_batch *batch,
2431                            const struct intel_device_info *devinfo,
2432                            uint32_t current_pipeline,
2433                            bool value)
2434 {
2435 #if GFX_VERx10 >= 120
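   /* CS_CHICKEN1 is a masked register: the Disable... value written below
    * only takes effect because the matching ...Mask bit is set in the same
    * write; bits whose mask is left clear are not modified by the hardware.
    */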
2436    anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
2437       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
2438       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
2439    }
2440 
2441    /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
2442    genx_batch_emit_pipe_control(batch, devinfo, current_pipeline,
2443                                 ANV_PIPE_CS_STALL_BIT);
2444 
2445    for (unsigned i = 0; i < 250; i++)
2446       anv_batch_emit(batch, GENX(MI_NOOP), noop);
2447 #endif
2448 }
2449 
2450 void
2451 genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
2452 {
2453 #if GFX_VERx10 >= 120
2454    if (cmd_buffer->state.gfx.object_preemption == value)
2455       return;
2456 
2457    genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device->info,
2458                               cmd_buffer->state.current_pipeline,
2459                               value);
2460    cmd_buffer->state.gfx.object_preemption = value;
2461 #endif
2462 }
2463 
2464 VkResult
2465 genX(BeginCommandBuffer)(
2466     VkCommandBuffer                             commandBuffer,
2467     const VkCommandBufferBeginInfo*             pBeginInfo)
2468 {
2469    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2470    VkResult result;
2471 
2472    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
2473     * command buffer's state. Otherwise, we must *reset* its state. In both
2474     * cases a full reset covers initialization, so we simply reset it.
2475     *
2476     * From the Vulkan 1.0 spec:
2477     *
2478     *    If a command buffer is in the executable state and the command buffer
2479     *    was allocated from a command pool with the
2480     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
2481     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
2482     *    as if vkResetCommandBuffer had been called with
2483     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
2484     *    the command buffer in the recording state.
2485     */
2486    anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
2487    anv_cmd_buffer_reset_rendering(cmd_buffer);
2488 
2489    cmd_buffer->usage_flags = pBeginInfo->flags;
2490 
2491    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
2492     * primary level command buffers.
2493     *
2494     * From the Vulkan 1.0 spec:
2495     *
2496     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
2497     *    secondary command buffer is considered to be entirely inside a render
2498     *    pass. If this is a primary command buffer, then this bit is ignored.
2499     */
2500    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2501       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2502 
2503 #if GFX_VER >= 12
2504    /* Reenable prefetching at the beginning of secondary command buffers. We
2505     * do this so that the edited batch-buffer return instruction is not
2506     * prefetched before the edit has completed.
2507     */
2508    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2509       anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
2510          arb.PreParserDisableMask = true;
2511          arb.PreParserDisable = false;
2512       }
2513    }
2514 #endif
2515 
2516    /* Assume the viewport has already been set in primary command buffers. */
2517    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
2518       cmd_buffer->state.gfx.viewport_set = true;
2519 
2520    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
2521 
2522    if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
2523        anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
2524       /* Re-emit the aux table register in every command buffer.  This way we
2525        * are guaranteed to have the table even if this command buffer doesn't
2526        * initialize any images.
2527        */
2528       if (cmd_buffer->device->info->has_aux_map) {
2529          anv_add_pending_pipe_bits(cmd_buffer,
2530                                    ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2531                                    "new cmd buffer with aux-tt");
2532       }
2533       return VK_SUCCESS;
2534    }
2535 
2536 #if GFX_VER >= 12
2537    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2538        cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) {
2539       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
2540          /* Default value for single session. */
2541          appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
2542          appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
2543       }
2544       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2545          pc.CommandStreamerStallEnable = true;
2546          pc.DCFlushEnable = true;
2547          pc.RenderTargetCacheFlushEnable = true;
2548          pc.ProtectedMemoryEnable = true;
2549       }
2550    }
2551 #endif
2552 
2553    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2554 
2555    /* We sometimes store vertex data in the dynamic state buffer for blorp
2556     * operations and our dynamic state stream may re-use data from previous
2557     * command buffers.  In order to prevent stale cache data, we flush the VF
2558     * cache.  We could do this on every blorp call but that's not really
2559     * needed as all of the data will get written by the CPU prior to the GPU
2560     * executing anything.  The chances are fairly high that they will use
2561     * blorp at least once per primary command buffer so it shouldn't be
2562     * wasted.
2563     *
2564     * There is also a workaround on gfx8 which requires us to invalidate the
2565     * VF cache occasionally.  It's easier if we can assume we start with a
2566     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
2567     */
2568    anv_add_pending_pipe_bits(cmd_buffer,
2569                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
2570                              "new cmd buffer");
2571 
2572    /* Re-emit the aux table register in every command buffer.  This way we
2573     * are guaranteed to have the table even if this command buffer doesn't
2574     * initialize any images.
2575     */
2576    if (cmd_buffer->device->info->has_aux_map) {
2577       anv_add_pending_pipe_bits(cmd_buffer,
2578                                 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2579                                 "new cmd buffer with aux-tt");
2580    }
2581 
2582    /* We send an "Indirect State Pointers Disable" packet at
2583     * EndCommandBuffer, so all push constant packets are ignored during a
2584     * context restore. Documentation says after that command, we need to
2585     * emit push constants again before any rendering operation. So we
2586     * flag them dirty here to make sure they get emitted.
2587     */
2588    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2589 
2590    if (cmd_buffer->usage_flags &
2591        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2592       struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2593 
2594       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
2595       const VkRenderingInfo *resume_info =
2596          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
2597                                                                pBeginInfo,
2598                                                                gcbiar_data);
2599       if (resume_info != NULL) {
2600          genX(CmdBeginRendering)(commandBuffer, resume_info);
2601       } else {
2602          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
2603             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
2604                                                              pBeginInfo);
2605          assert(inheritance_info);
2606 
2607          gfx->rendering_flags = inheritance_info->flags;
2608          gfx->render_area = (VkRect2D) { };
2609          gfx->layer_count = 0;
2610          gfx->samples = inheritance_info->rasterizationSamples;
2611          gfx->view_mask = inheritance_info->viewMask;
2612 
2613          uint32_t color_att_count = inheritance_info->colorAttachmentCount;
2614          result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
2615          if (result != VK_SUCCESS)
2616             return result;
2617 
2618          for (uint32_t i = 0; i < color_att_count; i++) {
2619             gfx->color_att[i].vk_format =
2620                inheritance_info->pColorAttachmentFormats[i];
2621          }
2622          gfx->depth_att.vk_format =
2623             inheritance_info->depthAttachmentFormat;
2624          gfx->stencil_att.vk_format =
2625             inheritance_info->stencilAttachmentFormat;
2626 
2627          anv_cmd_graphic_state_update_has_uint_rt(gfx);
2628 
2629          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
2630                                         ANV_CMD_DIRTY_RENDER_TARGETS;
2631       }
2632    }
2633 
2634    /* Emit the sample pattern at the beginning of the batch because the
2635     * default locations emitted at device initialization might have been
2636     * changed by a previous command buffer.
2637     *
2638     * Do not do this when we're continuing a previous render pass.
2639     */
2640    if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
2641        !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
2642       genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
2643 
2644    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2645       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
2646          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
2647 
2648       /* If secondary buffer supports conditional rendering
2649        * we should emit commands as if conditional rendering is enabled.
2650        */
2651       cmd_buffer->state.conditional_render_enabled =
2652          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
2653 
2654       if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
2655          cmd_buffer->state.gfx.n_occlusion_queries = 1;
2656          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
2657       }
2658    }
2659 
2660    return VK_SUCCESS;
2661 }
2662 
2663 /* From the PRM, Volume 2a:
2664  *
2665  *    "Indirect State Pointers Disable
2666  *
2667  *    At the completion of the post-sync operation associated with this pipe
2668  *    control packet, the indirect state pointers in the hardware are
2669  *    considered invalid; the indirect pointers are not saved in the context.
2670  *    If any new indirect state commands are executed in the command stream
2671  *    while the pipe control is pending, the new indirect state commands are
2672  *    preserved.
2673  *
2674  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
2675  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
2676  *    commands are only considered as Indirect State Pointers. Once ISP is
2677  *    issued in a context, SW must initialize by programming push constant
2678  *    commands for all the shaders (at least to zero length) before attempting
2679  *    any rendering operation for the same context."
2680  *
2681  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
2682  * even though they point to a BO that has been already unreferenced at
2683  * the end of the previous batch buffer. This has been fine so far since
2684  * we are protected by the scratch page (every address not covered by
2685  * a BO should be pointing to the scratch page). But on CNL, it is
2686  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
2687  * instruction.
2688  *
2689  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
2690  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
2691  * context restore, so the mentioned hang doesn't happen. However,
2692  * software must program push constant commands for all stages prior to
2693  * rendering anything. So we flag them dirty in BeginCommandBuffer.
2694  *
2695  * Finally, we also make sure to stall at pixel scoreboard to make sure the
2696  * constants have been loaded into the EUs prior to disabling the push
2697  * constants, so that a previous 3DPRIMITIVE doesn't hang.
2698  */
2699 static void
2700 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
2701 {
2702    genx_batch_emit_pipe_control(&cmd_buffer->batch,
2703                                 cmd_buffer->device->info,
2704                                 cmd_buffer->state.current_pipeline,
2705                                 ANV_PIPE_CS_STALL_BIT |
2706                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
2707    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2708          pc.IndirectStatePointersDisable = true;
2709          pc.CommandStreamerStallEnable = true;
2710          anv_debug_dump_pc(pc, __func__);
2711    }
2712 }
2713 
2714 static VkResult
2715 end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
2716 {
2717    if (anv_batch_has_error(&cmd_buffer->batch))
2718       return cmd_buffer->batch.status;
2719 
2720    anv_measure_endcommandbuffer(cmd_buffer);
2721 
2722    if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
2723        anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
2724       trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
2725       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2726       anv_cmd_buffer_end_batch_buffer(cmd_buffer);
2727       return VK_SUCCESS;
2728    }
2729 
2730    /* Flush query clears using blorp so that secondary query writes do not
2731     * race with the clear.
2732     */
2733    if (cmd_buffer->state.queries.clear_bits) {
2734       anv_add_pending_pipe_bits(cmd_buffer,
2735                                 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
2736                                 "query clear flush prior to command buffer end");
2737    }
2738 
2739    genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
2740 
2741    /* Turn object level preemption back on if it is disabled, so that it is
2742     * in a known state at the beginning of the next command buffer.
2743     */
2744    if (!cmd_buffer->state.gfx.object_preemption)
2745       genX(cmd_buffer_set_preemption)(cmd_buffer, true);
2746 
2747    /* We want every command buffer to start with the PMA fix in a known state,
2748     * so we disable it at the end of the command buffer.
2749     */
2750    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
2751 
2752    /* Wa_14015814527
2753     *
2754     * Apply task URB workaround in the end of primary or secondary cmd_buffer.
2755     */
2756    genX(apply_task_urb_workaround)(cmd_buffer);
2757 
2758    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2759 
2760    emit_isp_disable(cmd_buffer);
2761 
2762 #if GFX_VER >= 12
2763    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2764        cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) {
2765       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2766          pc.CommandStreamerStallEnable = true;
2767          pc.DCFlushEnable = true;
2768          pc.RenderTargetCacheFlushEnable = true;
2769          pc.ProtectedMemoryDisable = true;
2770       }
2771    }
2772 #endif
2773 
2774    trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
2775 
2776    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
2777 
2778    return VK_SUCCESS;
2779 }
2780 
2781 VkResult
2782 genX(EndCommandBuffer)(
2783     VkCommandBuffer                             commandBuffer)
2784 {
2785    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2786 
2787    VkResult status = end_command_buffer(cmd_buffer);
2788    if (status != VK_SUCCESS)
2789       return status;
2790 
2791    /* If there was MSAA access over the compute/transfer queue, a companion
2792     * RCS command buffer was used; end it properly as well.
2793     */
2794    if (cmd_buffer->companion_rcs_cmd_buffer) {
2795        assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
2796               anv_cmd_buffer_is_blitter_queue(cmd_buffer));
2797        status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
2798    }
2799 
2800    ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
2801 
2802    return status;
2803 }
2804 
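/* u_trace callback used by CmdExecuteCommands below to clone timestamps from
 * a secondary command buffer into the container. from_offset/to_offset are
 * indices into arrays of 64-bit timestamps, hence the sizeof(uint64_t)
 * scaling, and the actual copy goes through the SO-based memcpy path.
 */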
2805 static void
2806 cmd_buffer_emit_copy_ts_buffer(struct u_trace_context *utctx,
2807                                void *cmdstream,
2808                                void *ts_from, uint32_t from_offset,
2809                                void *ts_to, uint32_t to_offset,
2810                                uint32_t count)
2811 {
2812    struct anv_memcpy_state *memcpy_state = cmdstream;
2813    struct anv_address from_addr = (struct anv_address) {
2814       .bo = ts_from, .offset = from_offset * sizeof(uint64_t) };
2815    struct anv_address to_addr = (struct anv_address) {
2816       .bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
2817 
2818    genX(emit_so_memcpy)(memcpy_state, to_addr, from_addr,
2819                         count * sizeof(uint64_t));
2820 }
2821 
2822 void
2823 genX(CmdExecuteCommands)(
2824     VkCommandBuffer                             commandBuffer,
2825     uint32_t                                    commandBufferCount,
2826     const VkCommandBuffer*                      pCmdBuffers)
2827 {
2828    ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
2829 
2830    struct anv_device *device = container->device;
2831 
2832    if (anv_batch_has_error(&container->batch))
2833       return;
2834 
2835    /* The secondary command buffers will assume that the PMA fix is disabled
2836     * when they begin executing.  Make sure this is true.
2837     */
2838    genX(cmd_buffer_enable_pma_fix)(container, false);
2839 
2840    /* Turn on preemption in case it was toggled off. */
2841    if (!container->state.gfx.object_preemption)
2842       genX(cmd_buffer_set_preemption)(container, true);
2843 
2844    /* Wa_14015814527
2845     *
2846     * Apply task URB workaround before secondary cmd buffers.
2847     */
2848    genX(apply_task_urb_workaround)(container);
2849 
2850    /* Flush query clears using blorp so that secondary query writes do not
2851     * race with the clear.
2852     */
2853    if (container->state.queries.clear_bits) {
2854       anv_add_pending_pipe_bits(container,
2855                                 ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
2856                                 "query clear flush prior to secondary buffer");
2857    }
2858 
2859    /* The secondary command buffer doesn't know which textures etc. have been
2860     * flushed prior to their execution.  Apply those flushes now.
2861     */
2862    genX(cmd_buffer_apply_pipe_flushes)(container);
2863 
2864    genX(cmd_buffer_flush_generated_draws)(container);
2865 
2866    for (uint32_t i = 0; i < commandBufferCount; i++) {
2867       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
2868 
2869       assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
2870       assert(!anv_batch_has_error(&secondary->batch));
2871 
2872       if (secondary->state.conditional_render_enabled) {
2873          if (!container->state.conditional_render_enabled) {
2874             /* Secondary buffer is constructed as if it will be executed
2875              * with conditional rendering, we should satisfy this dependency
2876              * regardless of conditional rendering being enabled in container.
2877              */
2878             struct mi_builder b;
2879             mi_builder_init(&b, device->info, &container->batch);
2880             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
2881                          mi_imm(UINT64_MAX));
2882          }
2883       }
2884 
2885       if (secondary->usage_flags &
2886           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2887          /* If we're continuing a render pass from the container, we need to
2888           * copy the surface states for the current subpass into the storage
2889           * we allocated for them in BeginCommandBuffer.
2890           */
2891          struct anv_state src_state = container->state.gfx.att_states;
2892          struct anv_state dst_state = secondary->state.gfx.att_states;
2893          assert(src_state.alloc_size == dst_state.alloc_size);
2894 
2895          genX(cmd_buffer_so_memcpy)(
2896             container,
2897             anv_state_pool_state_address(&device->internal_surface_state_pool,
2898                                          dst_state),
2899             anv_state_pool_state_address(&device->internal_surface_state_pool,
2900                                          src_state),
2901             src_state.alloc_size);
2902       }
2903 
2904       anv_cmd_buffer_add_secondary(container, secondary);
2905 
2906       /* Add secondary buffer's RCS command buffer to container buffer's RCS
2907        * command buffer for execution if secondary RCS is valid.
2908        */
2909       if (secondary->companion_rcs_cmd_buffer != NULL) {
2910          VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
2911          if (result != VK_SUCCESS) {
2912             anv_batch_set_error(&container->batch, result);
2913             return;
2914          }
2915 
2916          anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
2917                                       secondary->companion_rcs_cmd_buffer);
2918       }
2919 
2920       assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
2921              secondary->perf_query_pool == container->perf_query_pool);
2922       if (secondary->perf_query_pool)
2923          container->perf_query_pool = secondary->perf_query_pool;
2924 
2925 #if INTEL_NEEDS_WA_1808121037
2926       if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
2927          container->state.depth_reg_mode = secondary->state.depth_reg_mode;
2928 #endif
2929 
2930       container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
2931    }
2932 
2933    /* The secondary isn't counted in our VF cache tracking so we need to
2934     * invalidate the whole thing.
2935     */
2936    if (GFX_VER == 9) {
2937       anv_add_pending_pipe_bits(container,
2938                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
2939                                 "Secondary cmd buffer not tracked in VF cache");
2940    }
2941 
2942 #if INTEL_WA_16014538804_GFX_VER
2943    if (anv_cmd_buffer_is_render_queue(container) &&
2944        intel_needs_workaround(device->info, 16014538804))
2945       anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
2946 #endif
2947 
2948    /* The secondary may have selected a different pipeline (3D or compute) and
2949     * may have changed the current L3$ configuration.  Reset our tracking
2950     * variables to invalid values to ensure that we re-emit these in the case
2951     * where we do any draws or compute dispatches from the container after the
2952     * secondary has returned.
2953     */
2954    container->state.current_pipeline = UINT32_MAX;
2955    container->state.current_l3_config = NULL;
2956    container->state.current_hash_scale = 0;
2957    container->state.gfx.push_constant_stages = 0;
2958    container->state.gfx.ds_write_state = false;
2959    memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
2960    memcpy(container->state.gfx.dyn_state.dirty,
2961           device->gfx_dirty_state,
2962           sizeof(container->state.gfx.dyn_state.dirty));
2963 
2964    /* Each of the secondary command buffers will use its own state base
2965     * address.  We need to re-emit state base address for the container after
2966     * all of the secondaries are done.
2967     *
2968     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
2969     * address calls?
2970     */
2971    genX(cmd_buffer_emit_state_base_address)(container);
2972 
2973    /* Copy utrace timestamp buffers from the secondaries into the container. */
2974    if (u_trace_enabled(&device->ds.trace_context)) {
2975       trace_intel_begin_trace_copy(&container->trace);
2976 
2977       struct anv_memcpy_state memcpy_state;
2978       genX(emit_so_memcpy_init)(&memcpy_state, device, &container->batch);
2979       uint32_t num_traces = 0;
2980       for (uint32_t i = 0; i < commandBufferCount; i++) {
2981          ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
2982 
2983          num_traces += secondary->trace.num_traces;
2984          u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
2985                               u_trace_end_iterator(&secondary->trace),
2986                               &container->trace,
2987                               &memcpy_state,
2988                               cmd_buffer_emit_copy_ts_buffer);
2989       }
2990       genX(emit_so_memcpy_fini)(&memcpy_state);
2991 
2992       trace_intel_end_trace_copy(&container->trace, num_traces);
2993 
2994       /* Memcpy is done using the 3D pipeline. */
2995       container->state.current_pipeline = _3D;
2996    }
2997 }
2998 
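/* The two helpers below translate Vulkan access masks into anv_pipe_bits:
 * source access masks map to cache *flushes* (make prior writes visible) and
 * destination access masks map to cache *invalidations* (drop possibly stale
 * data before the next read). cmd_buffer_barrier() ORs the two results
 * together, so, as an illustrative example, a barrier with
 *
 *    srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT
 *    dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT
 *
 * resolves to a render target cache flush plus a texture cache invalidation
 * per the cases below.
 */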
2999 static inline enum anv_pipe_bits
3000 anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3001                                      VkAccessFlags2 flags)
3002 {
3003    enum anv_pipe_bits pipe_bits = 0;
3004 
3005    u_foreach_bit64(b, flags) {
3006       switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3007       case VK_ACCESS_2_SHADER_WRITE_BIT:
3008       case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
3009       case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3010          /* We're transitioning a buffer that was previously used as write
3011           * destination through the data port. To make its content available
3012           * to future operations, flush the HDC pipeline.
3013           */
3014          pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3015          pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3016          break;
3017       case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
3018          /* We're transitioning a buffer that was previously used as render
3019           * target. To make its content available to future operations, flush
3020           * the render target cache.
3021           */
3022          pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3023          break;
3024       case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3025          /* We're transitioning a buffer that was previously used as depth
3026           * buffer. To make its content available to future operations, flush
3027           * the depth cache.
3028           */
3029          pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3030          break;
3031       case VK_ACCESS_2_TRANSFER_WRITE_BIT:
3032          /* We're transitioning a buffer that was previously used as a
3033           * transfer write destination. Generic write operations include color
3034           * & depth operations as well as buffer operations like:
3035           *     - vkCmdClearColorImage()
3036           *     - vkCmdClearDepthStencilImage()
3037           *     - vkCmdBlitImage()
3038           *     - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
3039           *
3040           * Most of these operations are implemented using Blorp which writes
3041           * through the render target cache or the depth cache on the graphics
3042           * queue. On the compute queue, the writes are done through the data
3043           * port.
3044           */
3045          if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
3046             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3047             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3048          } else {
3049             pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3050             pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3051          }
3052          break;
3053       case VK_ACCESS_2_MEMORY_WRITE_BIT:
3054          /* We're transitioning a buffer for generic write operations. Flush
3055           * all the caches.
3056           */
3057          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3058          break;
3059       case VK_ACCESS_2_HOST_WRITE_BIT:
3060          /* We're transitioning a buffer for access by CPU. Invalidate
3061           * all the caches. Since the data and tile caches have no invalidate
3062           * operation, we are forced to flush them as well.
3063           */
3064          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3065          pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3066          break;
3067       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3068       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3069          /* We're transitioning a buffer written either from VS stage or from
3070           * the command streamer (see CmdEndTransformFeedbackEXT), we just
3071           * need to stall the CS.
3072           *
3073           * Streamout writes apparently bypass L3; in order to make them
3074           * visible to the destination, we need to invalidate the other
3075           * caches.
3076           */
3077          pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
3078          break;
3079       default:
3080          break; /* Nothing to do */
3081       }
3082    }
3083 
3084    return pipe_bits;
3085 }
3086 
3087 static inline enum anv_pipe_bits
3088 anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3089                                           VkAccessFlags2 flags)
3090 {
3091    struct anv_device *device = cmd_buffer->device;
3092    enum anv_pipe_bits pipe_bits = 0;
3093 
3094    u_foreach_bit64(b, flags) {
3095       switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3096       case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
3097          /* Indirect draw commands take a buffer as input that we're going to
3098           * read from the command streamer to load some of the HW registers
3099           * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
3100           * command streamer stall so that all the cache flushes have
3101           * completed before the command streamer loads from memory.
3102           */
3103          pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3104          /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex
3105           * through a vertex buffer, so invalidate that cache.
3106           */
3107          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3108          /* For CmdDispatchIndirect, we also load gl_NumWorkGroups through a
3109           * UBO from the buffer, so we need to invalidate constant cache.
3110           */
3111          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3112          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3113          /* Tile cache flush needed for CmdDispatchIndirect since the command
3114           * streamer and vertex fetch aren't L3 coherent.
3115           */
3116          pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3117          break;
3118       case VK_ACCESS_2_INDEX_READ_BIT:
3119       case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
3120          /* We're transitioning a buffer to be used as input for vkCmdDraw*
3121           * commands, so we invalidate the VF cache to make sure there is no
3122           * stale data when we start rendering.
3123           */
3124          pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3125          break;
3126       case VK_ACCESS_2_UNIFORM_READ_BIT:
3127       case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
3128          /* We're transitioning a buffer to be used as uniform data. Because
3129           * uniform is accessed through the data port & sampler, we need to
3130           * invalidate the texture cache (sampler) & constant cache (data
3131           * port) to avoid stale data.
3132           */
3133          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3134          if (device->physical->compiler->indirect_ubos_use_sampler) {
3135             pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3136          } else {
3137             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3138             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3139          }
3140          break;
3141       case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
3142       case VK_ACCESS_2_TRANSFER_READ_BIT:
3143       case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
3144          /* Transitioning a buffer to be read through the sampler, so
3145           * invalidate the texture cache; we don't want any stale data.
3146           */
3147          pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3148          break;
3149       case VK_ACCESS_2_SHADER_READ_BIT:
3150          /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
3151           * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
3152           */
3153          pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3154                       ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3155          if (!device->physical->compiler->indirect_ubos_use_sampler) {
3156             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3157             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3158          }
3159          break;
3160       case VK_ACCESS_2_MEMORY_READ_BIT:
3161          /* Transitioning a buffer for generic read, invalidate all the
3162           * caches.
3163           */
3164          pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3165          break;
3166       case VK_ACCESS_2_MEMORY_WRITE_BIT:
3167          /* Generic write, make sure all previously written things land in
3168           * memory.
3169           */
3170          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3171          break;
3172       case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
3173       case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
3174          /* Transitioning a buffer for conditional rendering or transform
3175           * feedback. We'll load the content of this buffer into HW registers
3176           * using the command streamer, so we need to stall the command
3177           * streamer to make sure any in-flight flush operations have
3178           * completed.
3179           */
3180          pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3181          pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3182          pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3183          break;
3184       case VK_ACCESS_2_HOST_READ_BIT:
3185          /* We're transitioning a buffer that will be read by the CPU. Flush
3186           * all the caches so the CPU can see prior GPU writes.
3187           */
3188          pipe_bits |= ANV_PIPE_FLUSH_BITS;
3189          break;
3190       case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3191          /* We're transitioning a buffer to be written by the streamout fixed
3192           * function. This one is apparently not L3 coherent, so we need a
3193           * tile cache flush to make sure any previous write is not going to
3194           * create WaW hazards.
3195           */
3196          pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3197          break;
3198       case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
3199          /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
3200           * storage buffer, physical storage buffer, storage texel buffer, or
3201           * storage image in any shader pipeline stage.
3202           *
3203           * Any storage buffers or images written to must be invalidated and
3204           * flushed before the shader can access them.
3205           *
3206           * Both HDC & Untyped flushes also do invalidation, which is why we
3207           * use them here on Gfx12+.
3208           *
3209           * Gfx11 and prior don't have HDC. Only Data cache flush is available
3210           * and it only operates on the written cache lines.
3211           */
3212          if (device->info->ver >= 12) {
3213             pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3214             pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3215          }
3216          break;
3217       default:
3218          break; /* Nothing to do */
3219       }
3220    }
3221 
3222    return pipe_bits;
3223 }
3224 
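/* The stage/access predicates below deliberately over-approximate: they also
 * match the catch-all VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT (and, for shaders,
 * ALL_GRAPHICS), so a barrier using the generic stages is treated as if it
 * touched every matching category. This errs on the side of extra flushes
 * rather than missing one.
 */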
3225 static inline bool
3226 stage_is_shader(const VkPipelineStageFlags2 stage)
3227 {
3228    return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3229                     VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3230                     VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3231                     VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3232                     VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
3233                     VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
3234                     VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3235                     VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3236                     VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
3237                     VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
3238                     VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
3239 }
3240 
3241 static inline bool
3242 stage_is_transfer(const VkPipelineStageFlags2 stage)
3243 {
3244    return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3245                     VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
3246 }
3247 
3248 static inline bool
3249 stage_is_video(const VkPipelineStageFlags2 stage)
3250 {
3251    return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3252 #ifdef VK_ENABLE_BETA_EXTENSIONS
3253                     VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
3254 #endif
3255                     VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
3256 }
3257 
3258 static inline bool
3259 mask_is_shader_write(const VkAccessFlags2 access)
3260 {
3261    return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
3262                      VK_ACCESS_2_MEMORY_WRITE_BIT |
3263                      VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
3264 }
3265 
3266 static inline bool
3267 mask_is_write(const VkAccessFlags2 access)
3268 {
3269    return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
3270                     VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
3271                     VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
3272                     VK_ACCESS_2_TRANSFER_WRITE_BIT |
3273                     VK_ACCESS_2_HOST_WRITE_BIT |
3274                     VK_ACCESS_2_MEMORY_WRITE_BIT |
3275                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
3276                     VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
3277 #ifdef VK_ENABLE_BETA_EXTENSIONS
3278                     VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
3279 #endif
3280                     VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
3281                     VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
3282                     VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
3283                     VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
3284                     VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
3285                     VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
3286 }
3287 
3288 static inline bool
3289 mask_is_transfer_write(const VkAccessFlags2 access)
3290 {
3291    return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
3292                     VK_ACCESS_2_MEMORY_WRITE_BIT);
3293 }
3294 
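/* Barriers on the video (and blitter) engines cannot use PIPE_CONTROL, as
 * asserted in genX(batch_emit_pipe_control_write) above, so the helpers below
 * approximate them with MI_FLUSH_DW: flushing the LLC on queue family
 * ownership transfers or when video/transfer writes are consumed by other
 * stages, and flushing the CCS when a barriered image uses CCS compression.
 */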
3295 static void
3296 cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
3297                         const VkDependencyInfo *dep_info)
3298 {
3299    assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
3300 
3301    bool flush_llc = false;
3302    bool flush_ccs = false;
3303    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
3304       const VkImageMemoryBarrier2 *img_barrier =
3305          &dep_info->pImageMemoryBarriers[i];
3306 
3307       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
3308       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
3309 
3310       /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
3311        * memory barrier defines a queue family ownership transfer.
3312        */
3313       if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
3314          flush_llc = true;
3315 
3316       VkImageAspectFlags img_aspects =
3317             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3318       anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
3319          const uint32_t plane =
3320             anv_image_aspect_to_plane(image, 1UL << aspect_bit);
3321          if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
3322             flush_ccs = true;
3323          }
3324       }
3325    }
3326 
3327    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
3328       /* Flush the cache if something is written by the video operations and
3329        * used by any other stages except the video encode/decode stages, or
3330        * if srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
3331        * case this memory barrier defines a queue family ownership transfer).
3332        */
3333       if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
3334            mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
3335            !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
3336           (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
3337            dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
3338          flush_llc = true;
3339          break;
3340       }
3341    }
3342 
3343    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
3344       /* Flush the cache if something is written by the video operations and
3345        * used by any other stages except video encode/decode stage.
3346        */
3347       if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
3348           mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
3349           !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
3350          flush_llc = true;
3351          break;
3352       }
3353    }
3354 
3355    if (flush_ccs || flush_llc) {
3356       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
3357 #if GFX_VERx10 >= 125
3358          fd.FlushCCS = flush_ccs;
3359 #endif
3360 #if GFX_VER >= 12
3361          /* Using this bit on Gfx9 triggers a GPU hang.
3362           * This is undocumented behavior. Gfx12 seems fine.
3363           * TODO: check Gfx11
3364           */
3365          fd.FlushLLC = flush_llc;
3366 #endif
3367       }
3368    }
3369 }
3370 
3371 static void
3372 cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
3373                            const VkDependencyInfo *dep_info)
3374 {
3375 #if GFX_VERx10 >= 125
3376    assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
3377 
3378    /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
3379     * from being a destination to a source.
3380     */
3381    bool flush_llc = false;
3382    bool flush_ccs = false;
3383    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
3384       const VkImageMemoryBarrier2 *img_barrier =
3385          &dep_info->pImageMemoryBarriers[i];
3386 
3387       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
3388       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
3389 
3390       /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
3391        * memory barrier defines a queue family transfer operation.
3392        */
3393       if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
3394          flush_llc = true;
3395 
3396       /* Flush cache if transfer command reads the output of the previous
3397        * transfer command. Ideally we should just wait for its completion,
3398        * but for now just flush the cache to make the data visible.
3399        */
3400       if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
3401             img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
3402           (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
3403            img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
3404          flush_llc = true;
3405       }
3406 
3407       VkImageAspectFlags img_aspects =
3408             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3409       anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
3410          const uint32_t plane =
3411             anv_image_aspect_to_plane(image, 1UL << aspect_bit);
3412          if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
3413             flush_ccs = true;
3414          }
3415       }
3416    }
3417 
3418    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
3419       /* Flush the cache if something is written by the transfer command and
3420        * used by any other stages except the transfer stage, or if
3421        * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
3422        * case this memory barrier defines a queue family ownership transfer).
3423        */
3424       if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
3425            mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
3426           (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
3427            dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
3428          flush_llc = true;
3429          break;
3430       }
3431    }
3432 
3433    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
3434       /* Flush the cache if something is written by the transfer command and
3435        * used by any other stages except transfer stage.
3436        */
3437       if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
3438           mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
3439          flush_llc = true;
3440          break;
3441       }
3442    }
3443 
3444    if (flush_ccs || flush_llc) {
3445       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
3446       if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
3447          genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
3448                                                 cmd_buffer->device);
3449       }
3450       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
3451          fd.FlushCCS = flush_ccs;
3452          fd.FlushLLC = flush_llc;
3453       }
3454    }
3455 #endif
3456 }
3457 
3458 static inline bool
3459 cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
3460 {
3461    /* Query copies are only written with dataport, so we only need to check
3462     * that flag.
3463     */
3464    return (cmd_buffer->state.queries.buffer_write_bits &
3465            ANV_QUERY_WRITES_DATA_FLUSH) != 0;
3466 }
3467 
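/* Main barrier implementation: dispatches to the MI_FLUSH_DW based paths for
 * the video and blitter queues, otherwise accumulates the src/dst access
 * masks of all memory/buffer/image barriers, performs the required image
 * layout transitions, and converts the accumulated accesses into pending pipe
 * bits using the access-flag helpers above.
 */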
3468 static void
3469 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
3470                    const VkDependencyInfo *dep_info,
3471                    const char *reason)
3472 {
3473    if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
3474       cmd_buffer_barrier_video(cmd_buffer, dep_info);
3475       return;
3476    }
3477 
3478    if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3479       cmd_buffer_barrier_blitter(cmd_buffer, dep_info);
3480       return;
3481    }
3482 
3483    struct anv_device *device = cmd_buffer->device;
3484 
3485    /* XXX: Right now, we're really dumb and just flush whatever categories
3486     * the app asks for.  One of these days we may make this a bit better
3487     * but right now that's all the hardware allows for in most areas.
3488     */
3489    VkAccessFlags2 src_flags = 0;
3490    VkAccessFlags2 dst_flags = 0;
3491 
3492    bool apply_sparse_flushes = false;
3493    bool flush_query_copies = false;
3494 
3495    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
3496       src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
3497       dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
3498 
3499       /* Shader writes to buffers that could then be written by a transfer
3500        * command (including queries).
3501        */
3502       if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
3503           mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
3504           stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
3505          cmd_buffer->state.queries.buffer_write_bits |=
3506             ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
3507       }
3508 
3509       if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
3510           mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
3511           cmd_buffer_has_pending_copy_query(cmd_buffer))
3512          flush_query_copies = true;
3513 
3514       /* There's no way of knowing if this memory barrier is related to sparse
3515        * buffers! This is pretty horrible.
3516        */
3517       if (device->using_sparse && mask_is_write(src_flags))
3518          apply_sparse_flushes = true;
3519    }
3520 
3521    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
3522       const VkBufferMemoryBarrier2 *buf_barrier =
3523          &dep_info->pBufferMemoryBarriers[i];
3524       ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
3525 
3526       src_flags |= buf_barrier->srcAccessMask;
3527       dst_flags |= buf_barrier->dstAccessMask;
3528 
3529       /* Shader writes to buffers that could then be written by a transfer
3530        * command (including queries).
3531        */
3532       if (stage_is_shader(buf_barrier->srcStageMask) &&
3533           mask_is_shader_write(buf_barrier->srcAccessMask) &&
3534           stage_is_transfer(buf_barrier->dstStageMask)) {
3535          cmd_buffer->state.queries.buffer_write_bits |=
3536             ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
3537       }
3538 
3539       if (stage_is_transfer(buf_barrier->srcStageMask) &&
3540           mask_is_transfer_write(buf_barrier->srcAccessMask) &&
3541           cmd_buffer_has_pending_copy_query(cmd_buffer))
3542          flush_query_copies = true;
3543 
3544       if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
3545          apply_sparse_flushes = true;
3546    }
3547 
3548    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
3549       const VkImageMemoryBarrier2 *img_barrier =
3550          &dep_info->pImageMemoryBarriers[i];
3551 
3552       src_flags |= img_barrier->srcAccessMask;
3553       dst_flags |= img_barrier->dstAccessMask;
3554 
3555       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
3556       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
3557 
3558       uint32_t base_layer, layer_count;
3559       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
3560          base_layer = 0;
3561          layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
3562       } else {
3563          base_layer = range->baseArrayLayer;
3564          layer_count = vk_image_subresource_layer_count(&image->vk, range);
3565       }
3566       const uint32_t level_count =
3567          vk_image_subresource_level_count(&image->vk, range);
3568 
3569       VkImageLayout old_layout = img_barrier->oldLayout;
3570       VkImageLayout new_layout = img_barrier->newLayout;
3571 
3572       /* If we're inside a render pass, the runtime might have converted some
3573        * layouts from GENERAL to FEEDBACK_LOOP. Check if that's the case and
3574        * reconvert back to the original layout so that application barriers
3575        * within renderpass are operating with consistent layouts.
3576        */
3577       if (!cmd_buffer->vk.runtime_rp_barrier &&
3578           cmd_buffer->vk.render_pass != NULL) {
3579          assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
3580                                                                image));
3581          VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
3582 
3583          vk_command_buffer_get_attachment_layout(
3584             &cmd_buffer->vk, &image->vk,
3585             &subpass_att_layout, &subpass_stencil_att_layout);
3586 
3587          old_layout = subpass_att_layout;
3588          new_layout = subpass_att_layout;
3589       }
3590 
3591       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3592          transition_depth_buffer(cmd_buffer, image,
3593                                  base_layer, layer_count,
3594                                  old_layout, new_layout,
3595                                  false /* will_full_fast_clear */);
3596       }
3597 
3598       if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3599          transition_stencil_buffer(cmd_buffer, image,
3600                                    range->baseMipLevel, level_count,
3601                                    base_layer, layer_count,
3602                                    old_layout, new_layout,
3603                                    false /* will_full_fast_clear */);
3604       }
3605 
3606       if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
3607          VkImageAspectFlags color_aspects =
3608             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3609          anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
3610             transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
3611                                     range->baseMipLevel, level_count,
3612                                     base_layer, layer_count,
3613                                     old_layout, new_layout,
3614                                     img_barrier->srcQueueFamilyIndex,
3615                                     img_barrier->dstQueueFamilyIndex,
3616                                     false /* will_full_fast_clear */);
3617          }
3618       }
3619 
3620       /* Mark image as compressed if the destination layout has untracked
3621        * writes to the aux surface.
3622        */
3623       VkImageAspectFlags aspects =
3624          vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3625       anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
3626          VkImageAspectFlagBits aspect = 1UL << aspect_bit;
3627          if (anv_layout_has_untracked_aux_writes(
3628                 device->info,
3629                 image, aspect,
3630                 img_barrier->newLayout,
3631                 cmd_buffer->queue_family->queueFlags)) {
3632             for (uint32_t l = 0; l < level_count; l++) {
3633                set_image_compressed_bit(cmd_buffer, image, aspect,
3634                                         range->baseMipLevel + l,
3635                                         base_layer, layer_count,
3636                                         true);
3637             }
3638          }
3639       }
3640 
3641       if (anv_image_is_sparse(image) && mask_is_write(src_flags))
3642          apply_sparse_flushes = true;
3643    }
3644 
3645    enum anv_pipe_bits bits =
3646       anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
3647       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
3648 
3649    /* Our HW implementation of the sparse feature lives in the GAM unit
3650     * (the interface between all the GPU caches and external memory). As a
3651     * result, writes to NULL-bound images & buffers that should be ignored
3652     * are actually still visible in the caches. The only way to make
3653     * NULL-bound regions correctly return 0s is to evict the caches so that
3654     * they are repopulated with 0s.
3655     */
3656    if (apply_sparse_flushes)
3657       bits |= ANV_PIPE_FLUSH_BITS;
3658 
3659    /* Copies from query pools are executed with a shader writing through the
3660     * dataport.
3661     */
3662    if (flush_query_copies) {
3663       bits |= (GFX_VER >= 12 ?
3664                ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
3665    }
3666 
3667    if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
3668       genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
3669 
3670    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
3671 }
3672 
3673 void genX(CmdPipelineBarrier2)(
3674     VkCommandBuffer                             commandBuffer,
3675     const VkDependencyInfo*                     pDependencyInfo)
3676 {
3677    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3678 
3679    cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
3680 }
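/* A minimal application-side sketch (not driver code; handles and masks are
 * placeholders) of a barrier that ends up in cmd_buffer_barrier() above,
 * e.g. making color-attachment writes visible to sampled reads:
 *
 *    const VkImageMemoryBarrier2 img_barrier = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
 *       .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
 *       .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *       .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *       .image = image,
 *       .subresourceRange = {
 *          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
 *          .levelCount = 1,
 *          .layerCount = 1,
 *       },
 *    };
 *    const VkDependencyInfo dep_info = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .imageMemoryBarrierCount = 1,
 *       .pImageMemoryBarriers = &img_barrier,
 *    };
 *    vkCmdPipelineBarrier2(cmd, &dep_info);
 *
 * The old/new layouts and src/dst access masks are what drive the
 * transition_*_buffer() calls and the flush/invalidate bits computed above.
 */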
3681 
3682 void
3683 genX(batch_emit_breakpoint)(struct anv_batch *batch,
3684                             struct anv_device *device,
3685                             bool emit_before_draw)
3686 {
3687    /* Update draw call count once */
3688    uint32_t draw_count = emit_before_draw ?
3689                          p_atomic_inc_return(&device->draw_call_count) :
3690                          p_atomic_read(&device->draw_call_count);
3691 
3692    if (((draw_count == intel_debug_bkp_before_draw_count &&
3693         emit_before_draw) ||
3694        (draw_count == intel_debug_bkp_after_draw_count &&
3695         !emit_before_draw))) {
3696       struct anv_address wait_addr =
3697          anv_state_pool_state_address(&device->dynamic_state_pool,
3698                                       device->breakpoint);
3699 
3700       anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
3701          sem.WaitMode            = PollingMode;
3702          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
3703          sem.SemaphoreDataDword  = 0x1;
3704          sem.SemaphoreAddress    = wait_addr;
3705       };
3706    }
3707 }
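/* How the breakpoint above works (sketch; the exact CPU-side field names are
 * an assumption): the MI_SEMAPHORE_WAIT polls the dword at device->breakpoint
 * until it matches SemaphoreDataDword (0x1), so the batch stalls right
 * before/after the draw selected by intel_debug_bkp_before_draw_count /
 * intel_debug_bkp_after_draw_count. A debugger can then inspect the hung GPU
 * state and release the wait by writing the magic value through the CPU
 * mapping of that state, roughly:
 *
 *    *(uint32_t *)device->breakpoint.map = 0x1;
 */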
3708 
3709 /* Only emits PIPELINE_SELECT; for the whole mode switch and flushing, use
3710  * flush_pipeline_select().
3711  */
3712 void
3713 genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
3714                            const struct anv_device *device)
3715 {
3716    /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
3717 #if GFX_VER < 20
3718    anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
3719       ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
3720 #if GFX_VER == 12
3721       ps.MediaSamplerDOPClockGateEnable = true;
3722 #endif
3723       ps.PipelineSelection = pipeline;
3724 #if GFX_VERx10 == 125
3725       /* It might still be better to only enable this when the compute
3726        * pipeline will have DPAS instructions.
3727        */
3728       ps.SystolicModeEnable = pipeline == GPGPU &&
3729          device->vk.enabled_extensions.KHR_cooperative_matrix &&
3730          device->vk.enabled_features.cooperativeMatrix;
3731 #endif
3732    }
3733 #endif /* if GFX_VER < 20 */
3734 }
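/* Note on MaskBits above: PIPELINE_SELECT carries a write mask in its upper
 * bits that gates which of the low-order fields actually take effect. 0x3
 * unmasks only Pipeline Selection, 0x13 additionally unmasks Media Sampler
 * DOP Clock Gate Enable (Gfx12), and 0x93 further unmasks Systolic Mode
 * Enable (Gfx12.5). Fields left out of the mask keep their previous value.
 */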
3735 
3736 static void
3737 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
3738                             uint32_t pipeline)
3739 {
3740    UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
3741 
3742    if (cmd_buffer->state.current_pipeline == pipeline)
3743       return;
3744 
3745 #if GFX_VER >= 20
3746    /* Since we are not stalling/flushing caches explicitly while switching
3747     * between the pipelines, we need to apply data dependency flushes recorded
3748     * previously on the resource.
3749     */
3750    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3751 #else
3752 
3753 #if GFX_VER == 9
3754    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
3755     *
3756     *   Software must clear the COLOR_CALC_STATE Valid field in
3757     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
3758     *   with Pipeline Select set to GPGPU.
3759     *
3760     * The internal hardware docs recommend the same workaround for Gfx9
3761     * hardware too.
3762     */
3763    if (pipeline == GPGPU)
3764       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
3765 #endif
3766 
3767 #if GFX_VERx10 == 120
3768    /* Undocumented workaround to force the re-emission of
3769     * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
3770     * pipeline without rebinding a pipeline:
3771     *    vkCmdBindPipeline(COMPUTE, cs_pipeline);
3772     *    vkCmdDispatch(...);
3773     *    vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
3774     *    vkCmdDraw(...);
3775     *    vkCmdDispatch(...);
3776     */
3777    if (pipeline == _3D)
3778       cmd_buffer->state.compute.pipeline_dirty = true;
3779 #endif
3780 
3781    /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
3782     * pipeline. That means query clears will not be visible to query
3783     * copy/write. So we need to flush it before going to GPGPU mode.
3784     */
3785    if (cmd_buffer->state.current_pipeline == _3D &&
3786        cmd_buffer->state.queries.clear_bits) {
3787       anv_add_pending_pipe_bits(cmd_buffer,
3788                                 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
3789                                 "query clear flush prior to GPGPU");
3790    }
3791 
3792    /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
3793    enum anv_pipe_bits bits = 0;
3794 
3795 #if GFX_VER >= 12
3796    /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
3797     *
3798     *   "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
3799     *   are flushed through a stalling PIPE_CONTROL command prior to
3800     *   programming of PIPELINE_SELECT command transitioning Pipeline Select
3801     *   from 3D to GPGPU/Media.
3802     *   Software must ensure HDC Pipeline flush and Generic Media State Clear
3803     *   is issued through a stalling PIPE_CONTROL command prior to programming
3804     *   of PIPELINE_SELECT command transitioning Pipeline Select from
3805     *   GPGPU/Media to 3D."
3806     *
3807     * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
3808     * because PIPE was not in MEDIA mode?!
3809     */
3810    bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3811 
3812    if (cmd_buffer->state.current_pipeline == _3D) {
3813       bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3814               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3815    } else {
3816       bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3817    }
3818 #else
3819    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
3820     * PIPELINE_SELECT [DevBWR+]":
3821     *
3822     *   Project: DEVSNB+
3823     *
3824     *   Software must ensure all the write caches are flushed through a
3825     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
3826     *   command to invalidate read only caches prior to programming
3827     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
3828     *
3829     * Note that cmd_buffer_apply_pipe_flushes() will split this into two
3830     * PIPE_CONTROLs.
3831     */
3832    bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3833            ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
3834            ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
3835            ANV_PIPE_CS_STALL_BIT |
3836            ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
3837            ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3838            ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
3839            ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
3840            ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3841 #endif
3842 
3843    /* Wa_16013063087 - State Cache Invalidate must be issued prior to
3844     * PIPELINE_SELECT when switching from 3D to Compute.
3845     *
3846     * SW must do this by programming a PIPE_CONTROL with "CS Stall" followed
3847     * by a PIPE_CONTROL with the State Cache Invalidate bit set.
3848     *
3849     */
3850    if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
3851        intel_needs_workaround(cmd_buffer->device->info, 16013063087))
3852       bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
3853 
3854    anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
3855    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3856 
3857 #if GFX_VER == 9
3858    if (pipeline == _3D) {
3859       /* There is a mid-object preemption workaround which requires you to
3860        * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D.  However,
3861        * even without preemption, we have issues with geometry flickering when
3862        * GPGPU and 3D are back-to-back and this seems to fix it.  We don't
3863        * really know why.
3864        *
3865        * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
3866        *
3867        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
3868        *    the only bits that are changed are scoreboard related ..."
3869        *
3870        * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
3871        */
3872       anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
3873          vfe.MaximumNumberofThreads =
3874             devinfo->max_cs_threads * devinfo->subslice_total - 1;
3875          vfe.NumberofURBEntries     = 2;
3876          vfe.URBEntryAllocationSize = 2;
3877       }
3878 
3879       /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
3880        * invalid. Set the compute pipeline to dirty to force a re-emit of the
3881        * pipeline in case we get back-to-back dispatch calls with the same
3882        * pipeline and a PIPELINE_SELECT in between.
3883        */
3884       cmd_buffer->state.compute.pipeline_dirty = true;
3885    }
3886 #endif
3887 
3888    genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
3889 
3890 #if GFX_VER == 9
3891    if (devinfo->platform == INTEL_PLATFORM_GLK) {
3892       /* Project: DevGLK
3893        *
3894        * "This chicken bit works around a hardware issue with barrier logic
3895        *  encountered when switching between GPGPU and 3D pipelines.  To
3896        *  workaround the issue, this mode bit should be set after a pipeline
3897        *  is selected."
3898        */
3899       anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
3900          scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
3901                                                   : GLK_BARRIER_MODE_3D_HULL;
3902          scec1.GLKBarrierModeMask = 1;
3903       }
3904    }
3905 #endif
3906 #endif /* else of if GFX_VER >= 20 */
3907    cmd_buffer->state.current_pipeline = pipeline;
3908 }
3909 
3910 void
3911 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
3912 {
3913    genX(flush_pipeline_select)(cmd_buffer, _3D);
3914 }
3915 
3916 void
3917 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
3918 {
3919    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
3920 }
3921 
3922 void
3923 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
3924                                      const struct isl_surf *surf)
3925 {
3926 #if INTEL_NEEDS_WA_1808121037
3927    const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
3928                                surf->samples == 1;
3929 
3930    switch (cmd_buffer->state.depth_reg_mode) {
3931    case ANV_DEPTH_REG_MODE_HW_DEFAULT:
3932       if (!is_d16_1x_msaa)
3933          return;
3934       break;
3935    case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
3936       if (is_d16_1x_msaa)
3937          return;
3938       break;
3939    case ANV_DEPTH_REG_MODE_UNKNOWN:
3940       break;
3941    }
3942 
3943    /* We'll change some CHICKEN registers depending on the depth surface
3944     * format. Do a depth flush and stall so the pipeline is not using these
3945     * settings while we change the registers.
3946     */
3947    anv_add_pending_pipe_bits(cmd_buffer,
3948                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
3949                              ANV_PIPE_DEPTH_STALL_BIT |
3950                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
3951                              "Workaround: Stop pipeline for 1808121037");
3952    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3953 
3954    /* Wa_1808121037
3955     *
3956     * To avoid sporadic corruptions "Set 0x7010[9] when Depth Buffer
3957     * Surface Format is D16_UNORM, surface type is not NULL & 1X_MSAA".
3958     */
3959    anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
3960       reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
3961       reg.HIZPlaneOptimizationdisablebitMask = true;
3962    }
3963 
3964    cmd_buffer->state.depth_reg_mode =
3965       is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
3966                        ANV_DEPTH_REG_MODE_HW_DEFAULT;
3967 #endif
3968 }
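/* Summary of the tracking above: depth_reg_mode caches which CHICKEN register
 * programming is currently live, so the depth flush + register write is only
 * emitted when the D16 1X-MSAA condition actually changes.
 * ANV_DEPTH_REG_MODE_UNKNOWN (presumably set whenever the register content
 * can no longer be trusted, e.g. at command buffer start) forces an
 * unconditional re-emit on the next depth surface.
 */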
3969 
3970 #if GFX_VER == 9
3971 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
3972  *
3973  *    "The VF cache needs to be invalidated before binding and then using
3974  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
3975  *    (at a 64B granularity) since the last invalidation.  A VF cache
3976  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
3977  *    bit in PIPE_CONTROL."
3978  *
3979  * This is implemented by carefully tracking all vertex and index buffer
3980  * bindings and flushing if the cache ever ends up with a range in the cache
3981  * that would exceed 4 GiB.  This is implemented in three parts:
3982  *
3983  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
3984  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
3985  *       tracking code of the new binding.  If this new binding would cause
3986  *       the cache to have a too-large range on the next draw call, a pipeline
3987  *       stall and VF cache invalidate are added to pending_pipeline_bits.
3988  *
3989  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
3990  *       empty whenever we emit a VF invalidate.
3991  *
3992  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
3993  *       after every 3DPRIMITIVE and copies the bound range into the dirty
3994  *       range for each used buffer.  This has to be a separate step because
3995  *       we don't always re-bind all buffers and so 1. can't know which
3996  *       buffers are actually bound.
3997  */
3998 void
3999 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4000                                                int vb_index,
4001                                                struct anv_address vb_address,
4002                                                uint32_t vb_size)
4003 {
4004    if (GFX_VER > 9)
4005       return;
4006 
4007    struct anv_vb_cache_range *bound, *dirty;
4008    if (vb_index == -1) {
4009       bound = &cmd_buffer->state.gfx.ib_bound_range;
4010       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4011    } else {
4012       assert(vb_index >= 0);
4013       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4014       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4015       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4016       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4017    }
4018 
4019    if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4020                                                   vb_address,
4021                                                   vb_size)) {
4022       anv_add_pending_pipe_bits(cmd_buffer,
4023                                 ANV_PIPE_CS_STALL_BIT |
4024                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4025                                 "vb > 32b range");
4026    }
4027 }
4028 
4029 void
4030 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4031                                                     uint32_t access_type,
4032                                                     uint64_t vb_used)
4033 {
4034    if (access_type == RANDOM) {
4035       /* We have an index buffer */
4036       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4037       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4038 
4039       anv_merge_vb_cache_range(dirty, bound);
4040    }
4041 
4042    uint64_t mask = vb_used;
4043    while (mask) {
4044       int i = u_bit_scan64(&mask);
4045       assert(i >= 0);
4046       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4047       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4048 
4049       struct anv_vb_cache_range *bound, *dirty;
4050       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4051       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4052 
4053       anv_merge_vb_cache_range(dirty, bound);
4054    }
4055 }
4056 #endif /* GFX_VER == 9 */
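/* Putting the three pieces above together, the per-draw call pattern is
 * roughly the following (sketch, simplified from the real emit paths):
 *
 *    // on 3DSTATE_VERTEX_BUFFERS / 3DSTATE_INDEX_BUFFER emission
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb_index,
 *                                                   vb_address, vb_size);
 *    // any pending VF-cache invalidate is applied here, which also resets
 *    // the tracked ranges
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 *    ... emit 3DPRIMITIVE ...
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
 *                                                        access_type, vb_used);
 */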
4057 
4058 /**
4059  * Update the pixel hashing modes that determine the balancing of PS threads
4060  * across subslices and slices.
4061  *
4062  * \param width Width bound of the rendering area (already scaled down if \p
4063  *              scale is greater than 1).
4064  * \param height Height bound of the rendering area (already scaled down if \p
4065  *               scale is greater than 1).
4066  * \param scale The number of framebuffer samples that could potentially be
4067  *              affected by an individual channel of the PS thread.  This is
4068  *              typically one for single-sampled rendering, but for operations
4069  *              like CCS resolves and fast clears a single PS invocation may
4070  *              update a huge number of pixels, in which case a finer
4071  *              balancing is desirable in order to maximally utilize the
4072  *              bandwidth available.  UINT_MAX can be used as shorthand for
4073  *              "finest hashing mode available".
4074  */
4075 void
4076 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
4077                                    unsigned width, unsigned height,
4078                                    unsigned scale)
4079 {
4080 #if GFX_VER == 9
4081    const struct intel_device_info *devinfo = cmd_buffer->device->info;
4082    const unsigned slice_hashing[] = {
4083       /* Because all Gfx9 platforms with more than one slice require
4084        * three-way subslice hashing, a single "normal" 16x16 slice hashing
4085        * block is guaranteed to suffer from substantial imbalance, with one
4086        * subslice receiving twice as much work as the other two in the
4087        * slice.
4088        *
4089        * The performance impact of that would be particularly severe when
4090        * three-way hashing is also in use for slice balancing (which is the
4091        * case for all Gfx9 GT4 platforms), because one of the slices
4092        * receives one every three 16x16 blocks in either direction, which
4093        * is roughly the periodicity of the underlying subslice imbalance
4094        * pattern ("roughly" because in reality the hardware's
4095        * implementation of three-way hashing doesn't do exact modulo 3
4096        * arithmetic, which somewhat decreases the magnitude of this effect
4097        * in practice).  This leads to a systematic subslice imbalance
4098        * within that slice regardless of the size of the primitive.  The
4099        * 32x32 hashing mode guarantees that the subslice imbalance within a
4100        * single slice hashing block is minimal, largely eliminating this
4101        * effect.
4102        */
4103       _32x32,
4104       /* Finest slice hashing mode available. */
4105       NORMAL
4106    };
4107    const unsigned subslice_hashing[] = {
4108       /* 16x16 would provide a slight cache locality benefit especially
4109        * visible in the sampler L1 cache efficiency of low-bandwidth
4110        * non-LLC platforms, but it comes at the cost of greater subslice
4111        * imbalance for primitives of dimensions approximately intermediate
4112        * between 16x4 and 16x16.
4113        */
4114       _16x4,
4115       /* Finest subslice hashing mode available. */
4116       _8x4
4117    };
4118    /* Dimensions of the smallest hashing block of a given hashing mode.  If
4119     * the rendering area is smaller than this there can't possibly be any
4120     * benefit from switching to this mode, so we optimize out the
4121     * transition.
4122     */
4123    const unsigned min_size[][2] = {
4124          { 16, 4 },
4125          { 8, 4 }
4126    };
4127    const unsigned idx = scale > 1;
4128 
4129    if (cmd_buffer->state.current_hash_scale != scale &&
4130        (width > min_size[idx][0] || height > min_size[idx][1])) {
4131       anv_add_pending_pipe_bits(cmd_buffer,
4132                                 ANV_PIPE_CS_STALL_BIT |
4133                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
4134                                 "change pixel hash mode");
4135       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4136 
4137       anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
4138          gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
4139          gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
4140          gt.SubsliceHashing = subslice_hashing[idx];
4141          gt.SubsliceHashingMask = -1;
4142       }
4143 
4144       cmd_buffer->state.current_hash_scale = scale;
4145    }
4146 #endif
4147 }
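/* Usage note for the function above: regular rendering passes scale = 1,
 * while operations where a single PS invocation touches many pixels (CCS
 * resolves, fast clears) pass a large value such as UINT_MAX to select the
 * finer hashing tables. A sketch of a typical call for ordinary draws:
 *
 *    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
 */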
4148 
4149 static void
4150 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4151 {
4152    struct anv_device *device = cmd_buffer->device;
4153    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4154 
4155    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4156                                         device->isl_dev.ds.size / 4);
4157    if (dw == NULL)
4158       return;
4159 
4160    struct isl_view isl_view = {};
4161    struct isl_depth_stencil_hiz_emit_info info = {
4162       .view = &isl_view,
4163       .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4164    };
4165 
4166    if (gfx->depth_att.iview != NULL) {
4167       isl_view = gfx->depth_att.iview->planes[0].isl;
4168    } else if (gfx->stencil_att.iview != NULL) {
4169       isl_view = gfx->stencil_att.iview->planes[0].isl;
4170    }
4171 
4172    if (gfx->view_mask) {
4173       assert(isl_view.array_len == 0 ||
4174              isl_view.array_len >= util_last_bit(gfx->view_mask));
4175       isl_view.array_len = util_last_bit(gfx->view_mask);
4176    } else {
4177       assert(isl_view.array_len == 0 ||
4178              isl_view.array_len >= util_last_bit(gfx->layer_count));
4179       isl_view.array_len = gfx->layer_count;
4180    }
4181 
4182    if (gfx->depth_att.iview != NULL) {
4183       const struct anv_image_view *iview = gfx->depth_att.iview;
4184       const struct anv_image *image = iview->image;
4185 
4186       const uint32_t depth_plane =
4187          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4188       const struct anv_surface *depth_surface =
4189          &image->planes[depth_plane].primary_surface;
4190       const struct anv_address depth_address =
4191          anv_image_address(image, &depth_surface->memory_range);
4192 
4193       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
4194 
4195       info.depth_surf = &depth_surface->isl;
4196       info.depth_address = anv_address_physical(depth_address);
4197       info.mocs =
4198          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
4199 
4200       info.hiz_usage = gfx->depth_att.aux_usage;
4201       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
4202          assert(isl_aux_usage_has_hiz(info.hiz_usage));
4203 
4204          const struct anv_surface *hiz_surface =
4205             &image->planes[depth_plane].aux_surface;
4206          const struct anv_address hiz_address =
4207             anv_image_address(image, &hiz_surface->memory_range);
4208 
4209          anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
4210 
4211          info.hiz_surf = &hiz_surface->isl;
4212          info.hiz_address = anv_address_physical(hiz_address);
4213 
4214          info.depth_clear_value = ANV_HZ_FC_VAL;
4215       }
4216    }
4217 
4218    if (gfx->stencil_att.iview != NULL) {
4219       const struct anv_image_view *iview = gfx->stencil_att.iview;
4220       const struct anv_image *image = iview->image;
4221 
4222       const uint32_t stencil_plane =
4223          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
4224       const struct anv_surface *stencil_surface =
4225          &image->planes[stencil_plane].primary_surface;
4226       const struct anv_address stencil_address =
4227          anv_image_address(image, &stencil_surface->memory_range);
4228 
4229       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
4230 
4231       info.stencil_surf = &stencil_surface->isl;
4232 
4233       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
4234       info.stencil_address = anv_address_physical(stencil_address);
4235       info.mocs =
4236          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
4237    }
4238 
4239    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
4240 
4241    /* Wa_14016712196:
4242     * Emit depth flush after state that sends implicit depth flush.
4243     */
4244    if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
4245       genx_batch_emit_pipe_control(&cmd_buffer->batch,
4246                                    cmd_buffer->device->info,
4247                                    cmd_buffer->state.current_pipeline,
4248                                    ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
4249    }
4250 
4251    if (info.depth_surf)
4252       genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
4253 
4254    if (GFX_VER >= 11) {
4255       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
4256       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4257 
4258       if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
4259           intel_needs_workaround(cmd_buffer->device->info, 14014097488)) {
4260          /* Wa_1408224581
4261           *
4262           * Workaround (Gfx12LP A-step only): an additional pipe control with
4263           * post-sync = store dword operation would be required (the w/a is to
4264           * have an additional pipe control after the stencil state whenever
4265           * the surface state bits of this state change).
4266           *
4267           * This also seems sufficient to handle Wa_14014097488.
4268           */
4269          genx_batch_emit_pipe_control_write
4270             (&cmd_buffer->batch, cmd_buffer->device->info,
4271              cmd_buffer->state.current_pipeline, WriteImmediateData,
4272              cmd_buffer->device->workaround_address, 0, 0);
4273       }
4274    }
4275    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
4276 }
4277 
4278 static void
4279 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
4280                                    const struct anv_image_view *fsr_iview)
4281 {
4282 #if GFX_VERx10 >= 125
4283    struct anv_device *device = cmd_buffer->device;
4284 
4285    if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
4286       return;
4287 
4288    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4289                                         device->isl_dev.cpb.size / 4);
4290    if (dw == NULL)
4291       return;
4292 
4293    struct isl_cpb_emit_info info = { };
4294 
4295    if (fsr_iview) {
4296       const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
4297 
4298       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
4299 
4300       struct anv_address addr =
4301          anv_address_add(binding->address, binding->memory_range.offset);
4302 
4303       info.view = &fsr_iview->planes[0].isl;
4304       info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
4305       info.address = anv_address_physical(addr);
4306       info.mocs =
4307          anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
4308                   ISL_SURF_USAGE_CPB_BIT);
4309    }
4310 
4311    isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
4312 
4313    /* Wa_14016712196:
4314     * Emit depth flush after state that sends implicit depth flush.
4315     */
4316    if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
4317       genx_batch_emit_pipe_control(&cmd_buffer->batch,
4318                                    cmd_buffer->device->info,
4319                                    cmd_buffer->state.current_pipeline,
4320                                    ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
4321    }
4322 #endif /* GFX_VERx10 >= 125 */
4323 }
4324 
4325 static VkImageLayout
4326 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
4327 {
4328    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
4329       vk_find_struct_const(att->pNext,
4330                            RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
4331    if (layout_info != NULL)
4332       return layout_info->initialLayout;
4333 
4334    return att->imageLayout;
4335 }
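/* The MESA-specific pNext struct above lets the runtime (e.g. the common
 * render-pass emulation code) tell us which layout the image is actually in
 * when rendering begins. Whenever that initial layout differs from
 * att->imageLayout, CmdBeginRendering below performs the transition itself
 * before clearing or loading the attachment.
 */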
4336 
4337 void genX(CmdBeginRendering)(
4338     VkCommandBuffer                             commandBuffer,
4339     const VkRenderingInfo*                      pRenderingInfo)
4340 {
4341    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4342    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4343    VkResult result;
4344 
4345    if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
4346       assert(!"Trying to start a render pass on non-render queue!");
4347       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
4348       return;
4349    }
4350 
4351    anv_measure_beginrenderpass(cmd_buffer);
4352    trace_intel_begin_render_pass(&cmd_buffer->trace);
4353 
4354    gfx->rendering_flags = pRenderingInfo->flags;
4355    gfx->view_mask = pRenderingInfo->viewMask;
4356    gfx->layer_count = pRenderingInfo->layerCount;
4357    gfx->samples = 0;
4358 
4359    if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
4360        gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
4361        gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
4362        gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
4363       gfx->render_area = pRenderingInfo->renderArea;
4364       gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
4365    }
4366 
4367    const bool is_multiview = gfx->view_mask != 0;
4368    const VkRect2D render_area = gfx->render_area;
4369    const uint32_t layers =
4370       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
4371 
4372    /* The framebuffer size is at least large enough to contain the render
4373     * area.  Because a zero renderArea is possible, we MAX with 1.
4374     */
4375    struct isl_extent3d fb_size = {
4376       .w = MAX2(1, render_area.offset.x + render_area.extent.width),
4377       .h = MAX2(1, render_area.offset.y + render_area.extent.height),
4378       .d = layers,
4379    };
4380 
4381    const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
4382    result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
4383    if (result != VK_SUCCESS)
4384       return;
4385 
4386    genX(flush_pipeline_select_3d)(cmd_buffer);
4387 
4388    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4389       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
4390          continue;
4391 
4392       const VkRenderingAttachmentInfo *att =
4393          &pRenderingInfo->pColorAttachments[i];
4394       ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
4395       const VkImageLayout initial_layout = attachment_initial_layout(att);
4396 
4397       assert(render_area.offset.x + render_area.extent.width <=
4398              iview->vk.extent.width);
4399       assert(render_area.offset.y + render_area.extent.height <=
4400              iview->vk.extent.height);
4401       assert(layers <= iview->vk.layer_count);
4402 
4403       fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
4404       fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
4405 
4406       assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
4407       gfx->samples |= iview->vk.image->samples;
4408 
4409       enum isl_aux_usage aux_usage =
4410          anv_layout_to_aux_usage(cmd_buffer->device->info,
4411                                  iview->image,
4412                                  VK_IMAGE_ASPECT_COLOR_BIT,
4413                                  VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
4414                                  att->imageLayout,
4415                                  cmd_buffer->queue_family->queueFlags);
4416 
4417       union isl_color_value fast_clear_color = { .u32 = { 0, } };
4418 
4419       if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
4420           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
4421          const union isl_color_value clear_color =
4422             vk_to_isl_color_with_format(att->clearValue.color,
4423                                         iview->planes[0].isl.format);
4424 
4425          /* We only support fast-clears on the first layer */
4426          const bool fast_clear =
4427             (!is_multiview || (gfx->view_mask & 1)) &&
4428             anv_can_fast_clear_color_view(cmd_buffer->device, iview,
4429                                           att->imageLayout, clear_color,
4430                                           layers, render_area,
4431                                           cmd_buffer->queue_family->queueFlags);
4432 
4433          if (att->imageLayout != initial_layout) {
4434             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
4435                    render_area.extent.width == iview->vk.extent.width &&
4436                    render_area.extent.height == iview->vk.extent.height);
4437             if (is_multiview) {
4438                u_foreach_bit(view, gfx->view_mask) {
4439                   transition_color_buffer(cmd_buffer, iview->image,
4440                                           VK_IMAGE_ASPECT_COLOR_BIT,
4441                                           iview->vk.base_mip_level, 1,
4442                                           iview->vk.base_array_layer + view,
4443                                           1, /* layer_count */
4444                                           initial_layout, att->imageLayout,
4445                                           VK_QUEUE_FAMILY_IGNORED,
4446                                           VK_QUEUE_FAMILY_IGNORED,
4447                                           fast_clear);
4448                }
4449             } else {
4450                transition_color_buffer(cmd_buffer, iview->image,
4451                                        VK_IMAGE_ASPECT_COLOR_BIT,
4452                                        iview->vk.base_mip_level, 1,
4453                                        iview->vk.base_array_layer,
4454                                        gfx->layer_count,
4455                                        initial_layout, att->imageLayout,
4456                                        VK_QUEUE_FAMILY_IGNORED,
4457                                        VK_QUEUE_FAMILY_IGNORED,
4458                                        fast_clear);
4459             }
4460          }
4461 
4462          uint32_t clear_view_mask = pRenderingInfo->viewMask;
4463          uint32_t base_clear_layer = iview->vk.base_array_layer;
4464          uint32_t clear_layer_count = gfx->layer_count;
4465          if (fast_clear) {
4466             /* We only support fast-clears on the first layer */
4467             assert(iview->vk.base_mip_level == 0 &&
4468                    iview->vk.base_array_layer == 0);
4469 
4470             fast_clear_color = clear_color;
4471 
4472             if (iview->image->vk.samples == 1) {
4473                anv_image_ccs_op(cmd_buffer, iview->image,
4474                                 iview->planes[0].isl.format,
4475                                 iview->planes[0].isl.swizzle,
4476                                 VK_IMAGE_ASPECT_COLOR_BIT,
4477                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
4478                                 &fast_clear_color,
4479                                 false);
4480             } else {
4481                anv_image_mcs_op(cmd_buffer, iview->image,
4482                                 iview->planes[0].isl.format,
4483                                 iview->planes[0].isl.swizzle,
4484                                 VK_IMAGE_ASPECT_COLOR_BIT,
4485                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
4486                                 &fast_clear_color,
4487                                 false);
4488             }
4489             clear_view_mask &= ~1u;
4490             base_clear_layer++;
4491             clear_layer_count--;
4492 
4493             genX(set_fast_clear_state)(cmd_buffer, iview->image,
4494                                        iview->planes[0].isl.format,
4495                                        clear_color);
4496          }
4497 
4498          if (is_multiview) {
4499             u_foreach_bit(view, clear_view_mask) {
4500                anv_image_clear_color(cmd_buffer, iview->image,
4501                                      VK_IMAGE_ASPECT_COLOR_BIT,
4502                                      aux_usage,
4503                                      iview->planes[0].isl.format,
4504                                      iview->planes[0].isl.swizzle,
4505                                      iview->vk.base_mip_level,
4506                                      iview->vk.base_array_layer + view, 1,
4507                                      render_area, clear_color);
4508             }
4509          } else {
4510             anv_image_clear_color(cmd_buffer, iview->image,
4511                                   VK_IMAGE_ASPECT_COLOR_BIT,
4512                                   aux_usage,
4513                                   iview->planes[0].isl.format,
4514                                   iview->planes[0].isl.swizzle,
4515                                   iview->vk.base_mip_level,
4516                                   base_clear_layer, clear_layer_count,
4517                                   render_area, clear_color);
4518          }
4519       } else {
4520          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
4521          assert(att->imageLayout == initial_layout);
4522       }
4523 
4524       gfx->color_att[i].vk_format = iview->vk.format;
4525       gfx->color_att[i].iview = iview;
4526       gfx->color_att[i].layout = att->imageLayout;
4527       gfx->color_att[i].aux_usage = aux_usage;
4528 
4529       struct isl_view isl_view = iview->planes[0].isl;
4530       if (pRenderingInfo->viewMask) {
4531          assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
4532          isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
4533       } else {
4534          assert(isl_view.array_len >= pRenderingInfo->layerCount);
4535          isl_view.array_len = pRenderingInfo->layerCount;
4536       }
4537 
4538       anv_image_fill_surface_state(cmd_buffer->device,
4539                                    iview->image,
4540                                    VK_IMAGE_ASPECT_COLOR_BIT,
4541                                    &isl_view,
4542                                    ISL_SURF_USAGE_RENDER_TARGET_BIT,
4543                                    aux_usage, &fast_clear_color,
4544                                    0, /* anv_image_view_state_flags */
4545                                    &gfx->color_att[i].surface_state);
4546 
4547       add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
4548 
4549       if (GFX_VER < 10 &&
4550           (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
4551            (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
4552           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
4553           iview->planes[0].isl.base_level == 0 &&
4554           iview->planes[0].isl.base_array_layer == 0) {
4555          genX(load_image_clear_color)(cmd_buffer,
4556                                       gfx->color_att[i].surface_state.state,
4557                                       iview->image);
4558       }
4559 
4560       if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
4561          gfx->color_att[i].resolve_mode = att->resolveMode;
4562          gfx->color_att[i].resolve_iview =
4563             anv_image_view_from_handle(att->resolveImageView);
4564          gfx->color_att[i].resolve_layout = att->resolveImageLayout;
4565       }
4566    }
4567 
4568    anv_cmd_graphic_state_update_has_uint_rt(gfx);
4569 
4570    const struct anv_image_view *fsr_iview = NULL;
4571    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
4572       vk_find_struct_const(pRenderingInfo->pNext,
4573                            RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
4574    if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
4575       fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
4576       /* imageLayout and shadingRateAttachmentTexelSize are ignored */
4577    }
4578 
4579    const struct anv_image_view *ds_iview = NULL;
4580    const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
4581    const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
4582    if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
4583        (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
4584       const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
4585       VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4586       VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4587       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4588       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4589       enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
4590       enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
4591       float depth_clear_value = 0;
4592       uint32_t stencil_clear_value = 0;
4593 
4594       if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
4595          d_iview = anv_image_view_from_handle(d_att->imageView);
4596          initial_depth_layout = attachment_initial_layout(d_att);
4597          depth_layout = d_att->imageLayout;
4598          depth_aux_usage =
4599             anv_layout_to_aux_usage(cmd_buffer->device->info,
4600                                     d_iview->image,
4601                                     VK_IMAGE_ASPECT_DEPTH_BIT,
4602                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
4603                                     depth_layout,
4604                                     cmd_buffer->queue_family->queueFlags);
4605          depth_clear_value = d_att->clearValue.depthStencil.depth;
4606       }
4607 
4608       if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
4609          s_iview = anv_image_view_from_handle(s_att->imageView);
4610          initial_stencil_layout = attachment_initial_layout(s_att);
4611          stencil_layout = s_att->imageLayout;
4612          stencil_aux_usage =
4613             anv_layout_to_aux_usage(cmd_buffer->device->info,
4614                                     s_iview->image,
4615                                     VK_IMAGE_ASPECT_STENCIL_BIT,
4616                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
4617                                     stencil_layout,
4618                                     cmd_buffer->queue_family->queueFlags);
4619          stencil_clear_value = s_att->clearValue.depthStencil.stencil;
4620       }
4621 
4622       assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
4623       ds_iview = d_iview != NULL ? d_iview : s_iview;
4624       assert(ds_iview != NULL);
4625 
4626       assert(render_area.offset.x + render_area.extent.width <=
4627              ds_iview->vk.extent.width);
4628       assert(render_area.offset.y + render_area.extent.height <=
4629              ds_iview->vk.extent.height);
4630       assert(layers <= ds_iview->vk.layer_count);
4631 
4632       fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
4633       fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
4634 
4635       assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
4636       gfx->samples |= ds_iview->vk.image->samples;
4637 
4638       VkImageAspectFlags clear_aspects = 0;
4639       if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
4640           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
4641          clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4642       if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
4643           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
4644          clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4645 
4646       if (clear_aspects != 0) {
4647          const bool hiz_clear =
4648             anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
4649                                       depth_layout, clear_aspects,
4650                                       depth_clear_value,
4651                                       render_area,
4652                                       cmd_buffer->queue_family->queueFlags);
4653 
4654          if (depth_layout != initial_depth_layout) {
4655             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
4656                    render_area.extent.width == d_iview->vk.extent.width &&
4657                    render_area.extent.height == d_iview->vk.extent.height);
4658 
4659             if (is_multiview) {
4660                u_foreach_bit(view, gfx->view_mask) {
4661                   transition_depth_buffer(cmd_buffer, d_iview->image,
4662                                           d_iview->vk.base_array_layer + view,
4663                                           1 /* layer_count */,
4664                                           initial_depth_layout, depth_layout,
4665                                           hiz_clear);
4666                }
4667             } else {
4668                transition_depth_buffer(cmd_buffer, d_iview->image,
4669                                        d_iview->vk.base_array_layer,
4670                                        gfx->layer_count,
4671                                        initial_depth_layout, depth_layout,
4672                                        hiz_clear);
4673             }
4674          }
4675 
4676          if (stencil_layout != initial_stencil_layout) {
4677             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
4678                    render_area.extent.width == s_iview->vk.extent.width &&
4679                    render_area.extent.height == s_iview->vk.extent.height);
4680 
4681             if (is_multiview) {
4682                u_foreach_bit(view, gfx->view_mask) {
4683                   transition_stencil_buffer(cmd_buffer, s_iview->image,
4684                                             s_iview->vk.base_mip_level, 1,
4685                                             s_iview->vk.base_array_layer + view,
4686                                             1 /* layer_count */,
4687                                             initial_stencil_layout,
4688                                             stencil_layout,
4689                                             hiz_clear);
4690                }
4691             } else {
4692                transition_stencil_buffer(cmd_buffer, s_iview->image,
4693                                          s_iview->vk.base_mip_level, 1,
4694                                          s_iview->vk.base_array_layer,
4695                                          gfx->layer_count,
4696                                          initial_stencil_layout,
4697                                          stencil_layout,
4698                                          hiz_clear);
4699             }
4700          }
4701 
4702          if (is_multiview) {
4703             uint32_t clear_view_mask = pRenderingInfo->viewMask;
4704             while (clear_view_mask) {
4705                int view = u_bit_scan(&clear_view_mask);
4706 
4707                uint32_t level = ds_iview->vk.base_mip_level;
4708                uint32_t layer = ds_iview->vk.base_array_layer + view;
4709 
4710                if (hiz_clear) {
4711                   anv_image_hiz_clear(cmd_buffer, ds_iview->image,
4712                                       clear_aspects,
4713                                       level, layer, 1,
4714                                       render_area,
4715                                       stencil_clear_value);
4716                } else {
4717                   anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
4718                                                 clear_aspects,
4719                                                 depth_aux_usage,
4720                                                 level, layer, 1,
4721                                                 render_area,
4722                                                 depth_clear_value,
4723                                                 stencil_clear_value);
4724                }
4725             }
4726          } else {
4727             uint32_t level = ds_iview->vk.base_mip_level;
4728             uint32_t base_layer = ds_iview->vk.base_array_layer;
4729             uint32_t layer_count = gfx->layer_count;
4730 
4731             if (hiz_clear) {
4732                anv_image_hiz_clear(cmd_buffer, ds_iview->image,
4733                                    clear_aspects,
4734                                    level, base_layer, layer_count,
4735                                    render_area,
4736                                    stencil_clear_value);
4737             } else {
4738                anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
4739                                              clear_aspects,
4740                                              depth_aux_usage,
4741                                              level, base_layer, layer_count,
4742                                              render_area,
4743                                              depth_clear_value,
4744                                              stencil_clear_value);
4745             }
4746          }
4747       } else {
4748          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
4749          assert(depth_layout == initial_depth_layout);
4750          assert(stencil_layout == initial_stencil_layout);
4751       }
4752 
4753       if (d_iview != NULL) {
4754          gfx->depth_att.vk_format = d_iview->vk.format;
4755          gfx->depth_att.iview = d_iview;
4756          gfx->depth_att.layout = depth_layout;
4757          gfx->depth_att.aux_usage = depth_aux_usage;
4758          if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
4759             assert(d_att->resolveImageView != VK_NULL_HANDLE);
4760             gfx->depth_att.resolve_mode = d_att->resolveMode;
4761             gfx->depth_att.resolve_iview =
4762                anv_image_view_from_handle(d_att->resolveImageView);
4763             gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
4764          }
4765       }
4766 
4767       if (s_iview != NULL) {
4768          gfx->stencil_att.vk_format = s_iview->vk.format;
4769          gfx->stencil_att.iview = s_iview;
4770          gfx->stencil_att.layout = stencil_layout;
4771          gfx->stencil_att.aux_usage = stencil_aux_usage;
4772          if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
4773             assert(s_att->resolveImageView != VK_NULL_HANDLE);
4774             gfx->stencil_att.resolve_mode = s_att->resolveMode;
4775             gfx->stencil_att.resolve_iview =
4776                anv_image_view_from_handle(s_att->resolveImageView);
4777             gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
4778          }
4779       }
4780    }
4781 
4782    /* Finally, now that we know the right size, set up the null surface */
4783    assert(util_bitcount(gfx->samples) <= 1);
4784    isl_null_fill_state(&cmd_buffer->device->isl_dev,
4785                        gfx->null_surface_state.map,
4786                        .size = fb_size);
4787 
4788    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4789       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
4790          continue;
4791 
4792       isl_null_fill_state(&cmd_buffer->device->isl_dev,
4793                           gfx->color_att[i].surface_state.state.map,
4794                           .size = fb_size);
4795    }
4796 
4797    /****** We can now start emitting code to begin the render pass ******/
4798 
4799    gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
4800 
4801    /* It is possible to start a render pass with an old pipeline.  Because the
4802     * render pass and subpass index are both baked into the pipeline, this is
4803     * highly unlikely.  In order to do so, it requires that you have a render
4804     * pass with a single subpass and that you use that render pass twice
4805     * back-to-back and use the same pipeline at the start of the second render
4806     * pass as at the end of the first.  In order to avoid unpredictable issues
4807     * with this edge case, we just dirty the pipeline at the start of every
4808     * subpass.
4809     */
4810    gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
4811 
4812 #if GFX_VER >= 11
4813    bool has_color_att = false;
4814    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4815       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) {
4816          has_color_att = true;
4817          break;
4818       }
4819    }
4820    if (has_color_att) {
4821       /* The PIPE_CONTROL command description says:
4822        *
4823        *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
4824        *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
4825        *     Target Cache Flush by enabling this bit. When render target flush
4826        *     is set due to new association of BTI, PS Scoreboard Stall bit must
4827        *     be set in this packet."
4828        *
4829        * We assume that a new BeginRendering is always changing the RTs, which
4830        * may not be true and cause excessive flushing.  We can trivially skip it
4831        * in the case that there are no RTs (depth-only rendering), though.
4832        */
4833       anv_add_pending_pipe_bits(cmd_buffer,
4834                               ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4835                               ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
4836                               "change RT");
4837    }
4838 #endif
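   /* These bits are only accumulated here; they are turned into an actual
    * PIPE_CONTROL later by genX(cmd_buffer_apply_pipe_flushes), typically
    * right before the next draw or dispatch.
    */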
4839 
4840    cmd_buffer_emit_depth_stencil(cmd_buffer);
4841 
4842    cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
4843 }
4844 
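/* Mark the attachment's image as written so the driver's aux/compression
 * tracking stays up to date.  With multiview, each bit of the view mask maps
 * to a single array layer, so layers are marked one at a time; otherwise the
 * whole bound layer range is marked at once.
 */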
4845 static void
4846 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
4847                                    struct anv_attachment *att,
4848                                    VkImageAspectFlagBits aspect)
4849 {
4850    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4851    const struct anv_image_view *iview = att->iview;
4852 
4853    if (iview == NULL)
4854       return;
4855 
4856    if (gfx->view_mask == 0) {
4857       genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
4858                                           aspect, att->aux_usage,
4859                                           iview->planes[0].isl.base_level,
4860                                           iview->planes[0].isl.base_array_layer,
4861                                           gfx->layer_count);
4862    } else {
4863       uint32_t res_view_mask = gfx->view_mask;
4864       while (res_view_mask) {
4865          int i = u_bit_scan(&res_view_mask);
4866 
4867          const uint32_t level = iview->planes[0].isl.base_level;
4868          const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
4869 
4870          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
4871                                              aspect, att->aux_usage,
4872                                              level, layer, 1);
4873       }
4874    }
4875 }
4876 
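/* Map a Vulkan resolve mode onto the corresponding BLORP filter; unsupported
 * modes fall back to BLORP_FILTER_NONE.
 */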
4877 static enum blorp_filter
4878 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
4879 {
4880    switch (vk_mode) {
4881    case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
4882       return BLORP_FILTER_SAMPLE_0;
4883    case VK_RESOLVE_MODE_AVERAGE_BIT:
4884       return BLORP_FILTER_AVERAGE;
4885    case VK_RESOLVE_MODE_MIN_BIT:
4886       return BLORP_FILTER_MIN_SAMPLE;
4887    case VK_RESOLVE_MODE_MAX_BIT:
4888       return BLORP_FILTER_MAX_SAMPLE;
4889    default:
4890       return BLORP_FILTER_NONE;
4891    }
4892 }
4893 
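/* Resolve a multisampled attachment into its resolve target over the current
 * render area, deriving aux usages from the source/destination layouts as
 * transfer-src/transfer-dst.  With multiview, each view selected by the view
 * mask resolves a single array layer.
 */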
4894 static void
4895 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
4896                                    const struct anv_attachment *att,
4897                                    VkImageLayout layout,
4898                                    VkImageAspectFlagBits aspect)
4899 {
4900    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4901    const struct anv_image_view *src_iview = att->iview;
4902    const struct anv_image_view *dst_iview = att->resolve_iview;
4903 
4904    enum isl_aux_usage src_aux_usage =
4905       anv_layout_to_aux_usage(cmd_buffer->device->info,
4906                               src_iview->image, aspect,
4907                               VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
4908                               layout,
4909                               cmd_buffer->queue_family->queueFlags);
4910 
4911    enum isl_aux_usage dst_aux_usage =
4912       anv_layout_to_aux_usage(cmd_buffer->device->info,
4913                               dst_iview->image, aspect,
4914                               VK_IMAGE_USAGE_TRANSFER_DST_BIT,
4915                               att->resolve_layout,
4916                               cmd_buffer->queue_family->queueFlags);
4917 
4918    enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
4919 
4920    const VkRect2D render_area = gfx->render_area;
4921    if (gfx->view_mask == 0) {
4922       anv_image_msaa_resolve(cmd_buffer,
4923                              src_iview->image, src_aux_usage,
4924                              src_iview->planes[0].isl.base_level,
4925                              src_iview->planes[0].isl.base_array_layer,
4926                              dst_iview->image, dst_aux_usage,
4927                              dst_iview->planes[0].isl.base_level,
4928                              dst_iview->planes[0].isl.base_array_layer,
4929                              aspect,
4930                              render_area.offset.x, render_area.offset.y,
4931                              render_area.offset.x, render_area.offset.y,
4932                              render_area.extent.width,
4933                              render_area.extent.height,
4934                              gfx->layer_count, filter);
4935    } else {
4936       uint32_t res_view_mask = gfx->view_mask;
4937       while (res_view_mask) {
4938          int i = u_bit_scan(&res_view_mask);
4939 
4940          anv_image_msaa_resolve(cmd_buffer,
4941                                 src_iview->image, src_aux_usage,
4942                                 src_iview->planes[0].isl.base_level,
4943                                 src_iview->planes[0].isl.base_array_layer + i,
4944                                 dst_iview->image, dst_aux_usage,
4945                                 dst_iview->planes[0].isl.base_level,
4946                                 dst_iview->planes[0].isl.base_array_layer + i,
4947                                 aspect,
4948                                 render_area.offset.x, render_area.offset.y,
4949                                 render_area.offset.x, render_area.offset.y,
4950                                 render_area.extent.width,
4951                                 render_area.extent.height,
4952                                 1, filter);
4953       }
4954    }
4955 }
4956 
4957 void genX(CmdEndRendering)(
4958     VkCommandBuffer                             commandBuffer)
4959 {
4960    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4961    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4962 
4963    if (anv_batch_has_error(&cmd_buffer->batch))
4964       return;
4965 
4966    const bool is_multiview = gfx->view_mask != 0;
4967    const uint32_t layers =
4968       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
4969 
4970    bool has_color_resolve = false;
4971    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4972       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
4973                                          VK_IMAGE_ASPECT_COLOR_BIT);
4974 
4975       /* Stash this off for later */
4976       if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
4977           !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
4978          has_color_resolve = true;
4979    }
4980 
4981    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
4982                                        VK_IMAGE_ASPECT_DEPTH_BIT);
4983 
4984    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
4985                                        VK_IMAGE_ASPECT_STENCIL_BIT);
4986 
4987    if (has_color_resolve) {
4988       /* We are about to do some MSAA resolves.  We need to flush so that the
4989        * results of writes to the MSAA color attachments show up in the sampler
4990        * when we blit to the single-sampled resolve target.
4991        */
4992       anv_add_pending_pipe_bits(cmd_buffer,
4993                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4994                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
4995                                 "MSAA resolve");
4996    }
4997 
4998    if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT) &&
4999        (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
5000         gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)) {
5001       /* We are about to do some MSAA resolves.  We need to flush so that the
5002        * results of writes to the MSAA depth attachments show up in the sampler
5003        * when we blit to the single-sampled resolve target.
5004        */
5005       anv_add_pending_pipe_bits(cmd_buffer,
5006                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5007                                 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5008                                 "MSAA resolve");
5009    }
5010 
5011    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5012       const struct anv_attachment *att = &gfx->color_att[i];
5013       if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
5014           (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5015          continue;
5016 
5017       cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
5018                                          VK_IMAGE_ASPECT_COLOR_BIT);
5019    }
5020 
5021    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5022        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5023       const struct anv_image_view *src_iview = gfx->depth_att.iview;
5024 
5025       /* MSAA resolves sample from the source attachment.  Transition the
5026        * depth attachment first to get rid of any HiZ that we may not be
5027        * able to handle.
5028        */
5029       transition_depth_buffer(cmd_buffer, src_iview->image,
5030                               src_iview->planes[0].isl.base_array_layer,
5031                               layers,
5032                               gfx->depth_att.layout,
5033                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5034                               false /* will_full_fast_clear */);
5035 
5036       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
5037                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5038                                          VK_IMAGE_ASPECT_DEPTH_BIT);
5039 
5040       /* Transition the source back to the original layout.  This seems a bit
5041        * inefficient but, since HiZ resolves aren't destructive, going from
5042        * less HiZ to more is generally a no-op.
5043        */
5044       transition_depth_buffer(cmd_buffer, src_iview->image,
5045                               src_iview->planes[0].isl.base_array_layer,
5046                               layers,
5047                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5048                               gfx->depth_att.layout,
5049                               false /* will_full_fast_clear */);
5050    }
5051 
5052    if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5053        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5054       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
5055                                          gfx->stencil_att.layout,
5056                                          VK_IMAGE_ASPECT_STENCIL_BIT);
5057    }
5058 
5059 
5060    trace_intel_end_render_pass(&cmd_buffer->trace,
5061                                gfx->render_area.extent.width,
5062                                gfx->render_area.extent.height,
5063                                gfx->color_att_count,
5064                                gfx->samples);
5065 
5066    anv_cmd_buffer_reset_rendering(cmd_buffer);
5067 }
5068 
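/* Load MI_PREDICATE from the precomputed result: SRC0 holds
 * ANV_PREDICATE_RESULT_REG and SRC1 holds 0, COMPARE_SRCS_EQUAL tests
 * SRC0 == SRC1 and LOAD_LOADINV inverts that, so the predicate effectively
 * tracks "result register is non-zero".
 */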
5069 void
5070 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5071 {
5072    struct mi_builder b;
5073    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5074 
5075    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5076                 mi_reg32(ANV_PREDICATE_RESULT_REG));
5077    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5078 
5079    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5080       mip.LoadOperation    = LOAD_LOADINV;
5081       mip.CombineOperation = COMBINE_SET;
5082       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5083    }
5084 }
5085 
5086 void genX(CmdBeginConditionalRenderingEXT)(
5087    VkCommandBuffer                             commandBuffer,
5088    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
5089 {
5090    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5091    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5092    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5093    struct anv_address value_address =
5094       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5095 
5096    const bool isInverted = pConditionalRenderingBegin->flags &
5097                            VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5098 
5099    cmd_state->conditional_render_enabled = true;
5100 
5101    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5102 
5103    struct mi_builder b;
5104    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5105    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
5106    mi_builder_set_mocs(&b, mocs);
5107 
5108    /* Section 19.4 of the Vulkan 1.1.85 spec says:
5109     *
5110     *    If the value of the predicate in buffer memory changes
5111     *    while conditional rendering is active, the rendering commands
5112     *    may be discarded in an implementation-dependent way.
5113     *    Some implementations may latch the value of the predicate
5114     *    upon beginning conditional rendering while others
5115     *    may read it before every rendering command.
5116     *
5117     * So it's perfectly fine to read a value from the buffer once.
5118     */
5119    struct mi_value value = mi_mem32(value_address);
5120 
5121    /* Precompute the predicate result; this is necessary to support secondary
5122     * command buffers, since it is unknown whether conditional rendering is
5123     * inverted at the time they are populated.
5124     */
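   /* As a worked example of the expression below: mi_ult(0, value) is 1 iff
    * value != 0, while mi_uge(0, value) is 1 iff value == 0, so the register
    * ends up holding "should render" in both the normal and inverted cases.
    */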
5125    mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5126                 isInverted ? mi_uge(&b, mi_imm(0), value) :
5127                              mi_ult(&b, mi_imm(0), value));
5128 }
5129 
5130 void genX(CmdEndConditionalRenderingEXT)(
5131     VkCommandBuffer                             commandBuffer)
5132 {
5133    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5134    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5135 
5136    cmd_state->conditional_render_enabled = false;
5137 }
5138 
5139 /* Set of stage bits which are pipelined, i.e. the associated work gets
5140  * queued by the command streamer for later execution.
5141  */
5142 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5143    ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5144      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5145      VK_PIPELINE_STAGE_2_HOST_BIT | \
5146      VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
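/* The excluded stages above are serviced by the host or by the command
 * streamer itself rather than by pipelined GPU work, so (presumably) no CS or
 * pixel scoreboard stall is required before signalling an event for them.
 */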
5147 
5148 void genX(CmdSetEvent2)(
5149     VkCommandBuffer                             commandBuffer,
5150     VkEvent                                     _event,
5151     const VkDependencyInfo*                     pDependencyInfo)
5152 {
5153    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5154    ANV_FROM_HANDLE(anv_event, event, _event);
5155 
5156    if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5157       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5158          flush.PostSyncOperation = WriteImmediateData;
5159          flush.Address = anv_state_pool_state_address(
5160             &cmd_buffer->device->dynamic_state_pool,
5161             event->state);
5162          flush.ImmediateData = VK_EVENT_SET;
5163       }
5164       return;
5165    }
5166 
5167    VkPipelineStageFlags2 src_stages = 0;
5168 
5169    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5170       src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5171    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5172       src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5173    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5174       src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5175 
5176    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5177    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5178 
5179    enum anv_pipe_bits pc_bits = 0;
5180    if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5181       pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5182       pc_bits |= ANV_PIPE_CS_STALL_BIT;
5183    }
5184 
5185    genx_batch_emit_pipe_control_write
5186       (&cmd_buffer->batch, cmd_buffer->device->info,
5187        cmd_buffer->state.current_pipeline, WriteImmediateData,
5188        anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5189                                     event->state),
5190        VK_EVENT_SET, pc_bits);
5191 }
5192 
5193 void genX(CmdResetEvent2)(
5194     VkCommandBuffer                             commandBuffer,
5195     VkEvent                                     _event,
5196     VkPipelineStageFlags2                       stageMask)
5197 {
5198    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5199    ANV_FROM_HANDLE(anv_event, event, _event);
5200 
5201    if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5202       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5203          flush.PostSyncOperation = WriteImmediateData;
5204          flush.Address = anv_state_pool_state_address(
5205             &cmd_buffer->device->dynamic_state_pool,
5206             event->state);
5207          flush.ImmediateData = VK_EVENT_RESET;
5208       }
5209       return;
5210    }
5211 
5212    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5213    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5214 
5215    enum anv_pipe_bits pc_bits = 0;
5216    if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5217       pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5218       pc_bits |= ANV_PIPE_CS_STALL_BIT;
5219    }
5220 
5221    genx_batch_emit_pipe_control_write
5222       (&cmd_buffer->batch, cmd_buffer->device->info,
5223        cmd_buffer->state.current_pipeline, WriteImmediateData,
5224        anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5225                                     event->state),
5226        VK_EVENT_RESET,
5227        pc_bits);
5228 }
5229 
5230 void genX(CmdWaitEvents2)(
5231     VkCommandBuffer                             commandBuffer,
5232     uint32_t                                    eventCount,
5233     const VkEvent*                              pEvents,
5234     const VkDependencyInfo*                     pDependencyInfos)
5235 {
5236    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5237 
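   /* Poll each event's status dword in the dynamic state pool until it reads
    * VK_EVENT_SET, then apply the dependency as a regular barrier.
    */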
5238    for (uint32_t i = 0; i < eventCount; i++) {
5239       ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
5240 
5241       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5242          sem.WaitMode            = PollingMode;
5243          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
5244          sem.SemaphoreDataDword  = VK_EVENT_SET;
5245          sem.SemaphoreAddress    = anv_state_pool_state_address(
5246             &cmd_buffer->device->dynamic_state_pool,
5247             event->state);
5248       }
5249    }
5250 
5251    cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
5252 }
5253 
5254 static uint32_t vk_to_intel_index_type(VkIndexType type)
5255 {
5256    switch (type) {
5257    case VK_INDEX_TYPE_UINT8_KHR:
5258       return INDEX_BYTE;
5259    case VK_INDEX_TYPE_UINT16:
5260       return INDEX_WORD;
5261    case VK_INDEX_TYPE_UINT32:
5262       return INDEX_DWORD;
5263    default:
5264       unreachable("invalid index type");
5265    }
5266 }
5267 
5268 void genX(CmdBindIndexBuffer2KHR)(
5269     VkCommandBuffer                             commandBuffer,
5270     VkBuffer                                    _buffer,
5271     VkDeviceSize                                offset,
5272     VkDeviceSize                                size,
5273     VkIndexType                                 indexType)
5274 {
5275    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5276    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5277 
5278    uint32_t restart_index = vk_index_to_restart(indexType);
5279    if (cmd_buffer->state.gfx.restart_index != restart_index) {
5280       cmd_buffer->state.gfx.restart_index = restart_index;
5281       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
5282    }
5283 
5284    uint32_t index_type = vk_to_intel_index_type(indexType);
5285    if (cmd_buffer->state.gfx.index_buffer != buffer ||
5286        cmd_buffer->state.gfx.index_type != index_type ||
5287        cmd_buffer->state.gfx.index_offset != offset) {
5288       cmd_buffer->state.gfx.index_buffer = buffer;
5289       cmd_buffer->state.gfx.index_type = index_type;
5290       cmd_buffer->state.gfx.index_offset = offset;
5291       cmd_buffer->state.gfx.index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
5292       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
5293    }
5294 }
5295 
5296 VkResult genX(CmdSetPerformanceOverrideINTEL)(
5297     VkCommandBuffer                             commandBuffer,
5298     const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
5299 {
5300    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5301 
5302    switch (pOverrideInfo->type) {
5303    case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
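      /* CS_DEBUG_MODE2 is a masked register: the *Mask fields select which
       * bits of the write take effect, so only the two disable bits are
       * modified here.
       */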
5304       anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
5305          csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
5306          csdm2.MediaInstructionDisable = pOverrideInfo->enable;
5307          csdm2._3DRenderingInstructionDisableMask = true;
5308          csdm2.MediaInstructionDisableMask = true;
5309       }
5310       break;
5311    }
5312 
5313    case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
5314       if (pOverrideInfo->enable) {
5315          /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
5316          anv_add_pending_pipe_bits(cmd_buffer,
5317                                    ANV_PIPE_FLUSH_BITS |
5318                                    ANV_PIPE_INVALIDATE_BITS,
5319                                    "perf counter isolation");
5320          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5321       }
5322       break;
5323 
5324    default:
5325       unreachable("Invalid override");
5326    }
5327 
5328    return VK_SUCCESS;
5329 }
5330 
5331 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
5332     VkCommandBuffer                             commandBuffer,
5333     const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
5334 {
5335    /* TODO: Wait for the register write; might depend on the generation. */
5336 
5337    return VK_SUCCESS;
5338 }
5339 
5340 #define TIMESTAMP 0x2358
5341 
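/* Emit a timestamp capture of the requested type.  For the REWRITE_* types
 * nothing is emitted into the batch; the packed dwords are instead OR'ed into
 * the caller-provided command template pointed to by "data".
 */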
5342 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
5343                               struct anv_device *device,
5344                               struct anv_address addr,
5345                               enum anv_timestamp_capture_type type,
5346                               void *data) {
5347    /* Make sure the ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
5348     * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on the
5349     * transfer or video queues.
5350     */
5351    if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
5352        (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
5353       assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
5354              type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
5355    }
5356 
5357    switch (type) {
5358    case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
5359       struct mi_builder b;
5360       mi_builder_init(&b, device->info, batch);
5361       mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
5362       break;
5363    }
5364 
5365    case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
5366       if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
5367           (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
5368          /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
5369          if (intel_needs_workaround(device->info, 16018063123))
5370             genX(batch_emit_fast_color_dummy_blit)(batch, device);
5371          anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
5372             fd.PostSyncOperation = WriteTimestamp;
5373             fd.Address = addr;
5374          }
5375       } else {
5376          genx_batch_emit_pipe_control_write(batch, device->info, 0,
5377                                             WriteTimestamp, addr, 0, 0);
5378       }
5379       break;
5380    }
5381 
5382    case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
5383       genx_batch_emit_pipe_control_write
5384            (batch, device->info, 0, WriteTimestamp, addr, 0,
5385             ANV_PIPE_CS_STALL_BIT);
5386       break;
5387 
5388 #if GFX_VERx10 >= 125
5389    case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
5390       uint32_t dwords[GENX(COMPUTE_WALKER_length)];
5391 
5392       GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
5393             .PostSync = (struct GENX(POSTSYNC_DATA)) {
5394                .Operation = WriteTimestamp,
5395                .DestinationAddress = addr,
5396                .MOCS = anv_mocs(device, NULL, 0),
5397             },
5398          });
5399 
5400       for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
5401          ((uint32_t *)data)[i] |= dwords[i];
5402       break;
5403    }
5404 
5405    case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
5406       uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
5407 
5408       GENX(EXECUTE_INDIRECT_DISPATCH_pack)
5409       (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
5410             .MOCS = anv_mocs(device, NULL, 0),
5411             .COMPUTE_WALKER_BODY = {
5412                .PostSync = (struct GENX(POSTSYNC_DATA)) {
5413                   .Operation = WriteTimestamp,
5414                   .DestinationAddress = addr,
5415                   .MOCS = anv_mocs(device, NULL, 0),
5416                },
5417             }
5418       });
5419 
5420       for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
5421          ((uint32_t *)data)[i] |= dwords[i];
5422       break;
5423    }
5424 #endif
5425 
5426    default:
5427       unreachable("invalid");
5428    }
5429 }
5430 
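/* "Call" into a secondary batch: the MI_STORE_DATA_IMM below rewrites the
 * secondary's return MI_BATCH_BUFFER_START address (presumably the packet
 * emitted by genX(batch_emit_return)) so that the secondary jumps back to the
 * primary right after our own MI_BATCH_BUFFER_START.
 */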
5431 void genX(batch_emit_secondary_call)(struct anv_batch *batch,
5432                                      struct anv_address secondary_addr,
5433                                      struct anv_address secondary_return_addr)
5434 {
5435    /* Emit a write to change the return address of the secondary */
5436    uint64_t *write_return_addr =
5437       anv_batch_emitn(batch,
5438                       GENX(MI_STORE_DATA_IMM_length) + 1 /* QWord write */,
5439                       GENX(MI_STORE_DATA_IMM),
5440 #if GFX_VER >= 12
5441                       .ForceWriteCompletionCheck = true,
5442 #endif
5443                       .Address = secondary_return_addr) +
5444       GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 8;
5445 
5446 #if GFX_VER >= 12
5447    /* Disable prefetcher before jumping into a secondary */
5448    anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
5449       arb.PreParserDisableMask = true;
5450       arb.PreParserDisable = true;
5451    }
5452 #endif
5453 
5454    /* Jump into the secondary */
5455    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
5456       bbs.AddressSpaceIndicator = ASI_PPGTT;
5457       bbs.SecondLevelBatchBuffer = Firstlevelbatch;
5458       bbs.BatchBufferStartAddress = secondary_addr;
5459    }
5460 
5461    /* Replace the return address written by the MI_STORE_DATA_IMM above with
5462     * the primary's current batch address (immediately after the jump).
5463     */
5464    *write_return_addr =
5465       anv_address_physical(anv_batch_current_address(batch));
5466 }
5467 
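/* Emit the MI_BATCH_BUFFER_START used to return from a secondary batch and
 * hand back a pointer to it so the caller can patch in the return address.
 */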
5468 void *
5469 genX(batch_emit_return)(struct anv_batch *batch)
5470 {
5471    return anv_batch_emitn(batch,
5472                           GENX(MI_BATCH_BUFFER_START_length),
5473                           GENX(MI_BATCH_BUFFER_START),
5474                           .AddressSpaceIndicator = ASI_PPGTT,
5475                           .SecondLevelBatchBuffer = Firstlevelbatch);
5476 }
5477 
5478 void
5479 genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
5480                                       const struct anv_device *device,
5481                                       uint32_t primitive_topology,
5482                                       uint32_t vertex_count)
5483 {
5484 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
5485    if (intel_needs_workaround(device->info, 22014412737) &&
5486        (primitive_topology == _3DPRIM_POINTLIST ||
5487         primitive_topology == _3DPRIM_LINELIST ||
5488         primitive_topology == _3DPRIM_LINESTRIP ||
5489         primitive_topology == _3DPRIM_LINELIST_ADJ ||
5490         primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
5491         primitive_topology == _3DPRIM_LINELOOP ||
5492         primitive_topology == _3DPRIM_POINTLIST_BF ||
5493         primitive_topology == _3DPRIM_LINESTRIP_CONT ||
5494         primitive_topology == _3DPRIM_LINESTRIP_BF ||
5495         primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
5496        (vertex_count == 1 || vertex_count == 2)) {
5497       genx_batch_emit_pipe_control_write
5498          (batch, device->info, 0, WriteImmediateData,
5499           device->workaround_address, 0, 0);
5500 
5501       /* Reset counter because we just emitted a PC */
5502       batch->num_3d_primitives_emitted = 0;
5503    } else if (intel_needs_workaround(device->info, 16014538804)) {
5504       batch->num_3d_primitives_emitted++;
5505       /* Wa_16014538804:
5506        *    After every three 3D_Primitive commands,
5507        *    at least one PIPE_CONTROL must be inserted.
5508        */
5509       if (batch->num_3d_primitives_emitted == 3) {
5510          anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
5511          batch->num_3d_primitives_emitted = 0;
5512       }
5513    }
5514 #endif
5515 }
5516 
5517 /* Wa_16018063123 */
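/* Emit a tiny (1x4, linear) XY_FAST_COLOR_BLT targeting the workaround BO;
 * presumably only the presence of the blit matters, its destination contents
 * are never read.
 */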
5518 ALWAYS_INLINE void
5519 genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
5520                                       struct anv_device *device)
5521 {
5522 #if GFX_VERx10 >= 125
5523    anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
5524       blt.DestinationBaseAddress = device->workaround_address;
5525       blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
5526       blt.DestinationPitch = 63;
5527       blt.DestinationX2 = 1;
5528       blt.DestinationY2 = 4;
5529       blt.DestinationSurfaceWidth = 1;
5530       blt.DestinationSurfaceHeight = 4;
5531       blt.DestinationSurfaceType = XY_SURFTYPE_2D;
5532       blt.DestinationSurfaceQPitch = 4;
5533       blt.DestinationTiling = XY_TILE_LINEAR;
5534    }
5535 #endif
5536 }
5537 
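/* Wa_16014912113: when the tessellation-evaluation URB allocation changes,
 * re-emit the previous URB configuration (with the VS entry count pinned to
 * 256) followed by an HDC flush before the new configuration is programmed.
 */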
5538 void
5539 genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
5540                      const struct intel_urb_config *urb_cfg)
5541 {
5542 #if INTEL_NEEDS_WA_16014912113
5543    const struct intel_urb_config *current =
5544       &cmd_buffer->state.gfx.urb_cfg;
5545    if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
5546        current->size[0] != 0) {
5547       for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
5548          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
5549             urb._3DCommandSubOpcode      += i;
5550             urb.VSURBStartingAddress      = current->start[i];
5551             urb.VSURBEntryAllocationSize  = current->size[i] - 1;
5552             urb.VSNumberofURBEntries      = i == 0 ? 256 : 0;
5553          }
5554       }
5555       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5556          pc.HDCPipelineFlushEnable = true;
5557       }
5558    }
5559 #endif
5560 }
5561 
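/* Allocate a two-dword syncpoint in dynamic state: dword 0 is waited on by
 * the main (compute/blitter) queue and signalled by the companion RCS from
 * the end() counterpart, while dword 1 is signalled here to unblock the
 * companion RCS.  The net effect is that the main queue stalls until the
 * companion RCS work has completed.
 */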
5562 struct anv_state
5563 genX(cmd_buffer_begin_companion_rcs_syncpoint)(
5564       struct anv_cmd_buffer   *cmd_buffer)
5565 {
5566 #if GFX_VERx10 >= 125
5567    const struct intel_device_info *info = cmd_buffer->device->info;
5568    struct anv_state syncpoint =
5569       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
5570    struct anv_address xcs_wait_addr =
5571       anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5572                                    syncpoint);
5573    struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
5574 
5575    /* Reset the sync point */
5576    memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
5577 
5578    struct mi_builder b;
5579 
5580    /* On CCS:
5581     *    - flush all caches & invalidate
5582     *    - unblock RCS
5583     *    - wait on RCS to complete
5584     *    - clear the value we waited on
5585     */
5586 
5587    if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
5588       anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_FLUSH_BITS |
5589                                             ANV_PIPE_INVALIDATE_BITS |
5590                                             ANV_PIPE_STALL_BITS,
5591                                 "post main cmd buffer invalidate");
5592       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5593    } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
5594       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
5595       if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
5596          genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
5597                                                 cmd_buffer->device);
5598       }
5599       anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
5600          fd.FlushCCS = true; /* Maybe handle Flush LLC */
5601       }
5602    }
5603 
5604    {
5605       mi_builder_init(&b, info, &cmd_buffer->batch);
5606       mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
5607       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5608          sem.WaitMode            = PollingMode;
5609          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
5610          sem.SemaphoreDataDword  = 0x1;
5611          sem.SemaphoreAddress    = xcs_wait_addr;
5612       }
5613       /* Make sure to reset the semaphore in case the command buffer is run
5614        * multiple times.
5615        */
5616       mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
5617    }
5618 
5619    /* On RCS:
5620     *    - wait on CCS signal
5621     *    - clear the value we waited on
5622     */
5623    {
5624       mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
5625       anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
5626                      GENX(MI_SEMAPHORE_WAIT),
5627                      sem) {
5628          sem.WaitMode            = PollingMode;
5629          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
5630          sem.SemaphoreDataDword  = 0x1;
5631          sem.SemaphoreAddress    = rcs_wait_addr;
5632       }
5633       /* Make sure to reset the semaphore in case the command buffer is run
5634        * multiple times.
5635        */
5636       mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
5637    }
5638 
5639    return syncpoint;
5640 #else
5641    unreachable("Not implemented");
5642 #endif
5643 }
5644 
5645 void
5646 genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
5647                                              struct anv_state syncpoint)
5648 {
5649 #if GFX_VERx10 >= 125
5650    struct anv_address xcs_wait_addr =
5651       anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5652                                    syncpoint);
5653 
5654    struct mi_builder b;
5655 
5656    /* On RCS:
5657     *    - flush all caches & invalidate
5658     *    - unblock the CCS
5659     */
5660    anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
5661                              ANV_PIPE_FLUSH_BITS |
5662                              ANV_PIPE_INVALIDATE_BITS |
5663                              ANV_PIPE_STALL_BITS,
5664                              "post rcs flush");
5665    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
5666 
5667    mi_builder_init(&b, cmd_buffer->device->info,
5668                    &cmd_buffer->companion_rcs_cmd_buffer->batch);
5669    mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
5670 #else
5671    unreachable("Not implemented");
5672 #endif
5673 }
5674 
5675 VkResult
5676 genX(write_trtt_entries)(struct anv_trtt_submission *submit)
5677 {
5678 #if GFX_VER >= 12
5679    size_t batch_size = submit->l3l2_binds_len * 20 +
5680                        submit->l1_binds_len * 16 + 8;
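   /* Worst-case sizing with no write coalescing: a qword MI_STORE_DATA_IMM is
    * 5 dwords (20 bytes) per L3/L2 bind, a dword write is 4 dwords (16 bytes)
    * per L1 bind, plus some room for the MI_BATCH_BUFFER_END.
    */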
5681    STACK_ARRAY(uint32_t, cmds, batch_size);
5682    struct anv_batch batch = {
5683       .start = cmds,
5684       .next = cmds,
5685       .end = (void *)cmds + batch_size,
5686    };
5687 
5688    /* BSpec says:
5689     *   "DWord Length programmed must not exceed 0x3FE."
5690     * For a single dword write the programmed length is 2, and for a single
5691     * qword it's 3. This is the value actually written to the DWord Length
5692     * field, so it does not include the length bias.
5693     */
5694    uint32_t dword_write_len = 2;
5695    uint32_t qword_write_len = 3;
5696    uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
5697    uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
5698 
5699    /* What makes the code below quite complicated is the fact that we can
5700     * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
5701     * contiguous addresses.
5702     */
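   /* For example (illustrative values): three L3/L2 binds whose PTEs sit at
    * 0x1000, 0x1008 and 0x1010 are contiguous qwords, so they are folded into
    * a single MI_STORE_DATA_IMM carrying three 64-bit payloads instead of
    * three separate commands.
    */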
5703 
5704    for (int i = 0; i < submit->l3l2_binds_len; i++) {
5705       int extra_writes = 0;
5706       for (int j = i + 1;
5707            j < submit->l3l2_binds_len &&
5708             extra_writes <= max_qword_extra_writes;
5709            j++) {
5710          if (submit->l3l2_binds[i].pte_addr + (j - i) * 8 ==
5711              submit->l3l2_binds[j].pte_addr) {
5712             extra_writes++;
5713          } else {
5714             break;
5715          }
5716       }
5717       bool is_last_write = submit->l1_binds_len == 0 &&
5718                            i + extra_writes + 1 == submit->l3l2_binds_len;
5719 
5720       uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
5721                            qword_write_len + (extra_writes * 2);
5722       uint32_t *dw;
5723       dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
5724          .ForceWriteCompletionCheck = is_last_write,
5725          .StoreQword = true,
5726          .Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr),
5727       );
5728       dw += 3;
5729       for (int j = 0; j < extra_writes + 1; j++) {
5730          uint64_t entry_addr_64b = submit->l3l2_binds[i + j].entry_addr;
5731          *dw = entry_addr_64b & 0xFFFFFFFF;
5732          dw++;
5733          *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
5734          dw++;
5735       }
5736       assert(dw == batch.next);
5737 
5738       i += extra_writes;
5739    }
5740 
5741    for (int i = 0; i < submit->l1_binds_len; i++) {
5742       int extra_writes = 0;
5743       for (int j = i + 1;
5744            j < submit->l1_binds_len && extra_writes <= max_dword_extra_writes;
5745            j++) {
5746          if (submit->l1_binds[i].pte_addr + (j - i) * 4 ==
5747              submit->l1_binds[j].pte_addr) {
5748             extra_writes++;
5749          } else {
5750             break;
5751          }
5752       }
5753 
5754       bool is_last_write = i + extra_writes + 1 == submit->l1_binds_len;
5755 
5756       uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
5757                            dword_write_len + extra_writes;
5758       uint32_t *dw;
5759       dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
5760          .ForceWriteCompletionCheck = is_last_write,
5761          .Address = anv_address_from_u64(submit->l1_binds[i].pte_addr),
5762       );
5763       dw += 3;
5764       for (int j = 0; j < extra_writes + 1; j++) {
5765          *dw = (submit->l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
5766          dw++;
5767       }
5768       assert(dw == batch.next);
5769 
5770       i += extra_writes;
5771    }
5772 
5773    anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
5774 
5775    assert(batch.next <= batch.end);
5776 
5777    VkResult result = anv_queue_submit_trtt_batch(submit->sparse, &batch);
5778    STACK_ARRAY_FINISH(cmds);
5779 
5780    return result;
5781 
5782 #endif
5783    return VK_SUCCESS;
5784 }
5785 
5786 void
5787 genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
5788                                VkPipelineStageFlags2 stage,
5789                                VkBuffer dstBuffer,
5790                                VkDeviceSize dstOffset,
5791                                uint32_t marker)
5792 {
5793    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5794    ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
5795 
5796    /* The barriers inserted by the application to make dstBuffer writable
5797     * should already have the L1/L2 cache flushes. On platforms where the
5798     * command streamer is not coherent with L3, we need an additional set of
5799     * cache flushes.
5800     */
5801    enum anv_pipe_bits bits =
5802       (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
5803        (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
5804       ANV_PIPE_END_OF_PIPE_SYNC_BIT;
5805 
5806    trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
5807 
5808    anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
5809    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5810 
5811    struct mi_builder b;
5812    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5813 
5814    /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
5815     * would be the logical way to implement this extension, as it could
5816     * do a pipelined marker write.  Unfortunately, it requires writing
5817     * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
5818     * 32-bit value.  MI_STORE_DATA_IMM is the only good way to do that,
5819     * and unfortunately it requires stalling.
5820     */
5821    mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
5822                 mi_imm(marker));
5823 
5824    trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
5825 }
5826