1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31
32 #include "util/format_srgb.h"
33
34 #include "genxml/gen_macros.h"
35 #include "genxml/genX_pack.h"
36
37 #include "ds/intel_tracepoints.h"
38
39 #include "genX_mi_builder.h"
40 #include "genX_cmd_draw_generated_flush.h"
41
42 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
43 uint32_t pipeline);
44
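/* Translate the flush/invalidate fields of a PIPE_CONTROL packet back into
 * ANV_PIPE_* bits; only used for the debug dumping helper below.
 */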
45 static enum anv_pipe_bits
46 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
47 enum anv_pipe_bits bits = 0;
48 bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
49 bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
50 #if GFX_VERx10 >= 125
51 bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
52 #endif
53 #if GFX_VER == 12
54 bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
55 bits |= (pc->L3FabricFlush) ? ANV_PIPE_L3_FABRIC_FLUSH_BIT : 0;
56 #endif
57 #if GFX_VER >= 12
58 bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
59 #endif
60 bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
61 bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
62 bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
63 bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
64 bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
65 bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
66 bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
67 bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
68 bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
69 #if GFX_VERx10 == 125
70 bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
71 bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
72 #endif
73 return bits;
74 }
75
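/* Debug helper: when the INTEL_DEBUG pipe-control flag is set, print the
 * ANV_PIPE_* bits implied by a PIPE_CONTROL together with the reason string.
 */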
76 #define anv_debug_dump_pc(pc, reason) \
77 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
78 fputs("pc : emit PC=( ", stdout); \
79 anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout); \
80 fprintf(stdout, ") reason: %s\n", reason); \
81 }
82
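/* Fill out a STATE_BASE_ADDRESS template for this command buffer. The
 * bindless surface state fields depend on whether the descriptor buffer
 * (EXT_descriptor_buffer) path or the legacy descriptor path is in use.
 */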
83 static inline void
84 fill_state_base_addr(struct anv_cmd_buffer *cmd_buffer,
85 struct GENX(STATE_BASE_ADDRESS) *sba)
86 {
87 struct anv_device *device = cmd_buffer->device;
88 const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
89
90 /* If no API entry point has selected the current mode yet (this can happen
91 * depending on the first operation recorded in the command buffer), select
92 * BUFFER if EXT_descriptor_buffer is enabled, otherwise LEGACY.
93 */
94 if (cmd_buffer->state.pending_db_mode ==
95 ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN) {
96 cmd_buffer->state.pending_db_mode =
97 cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer ?
98 ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
99 ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
100 }
101
102 *sba = (struct GENX(STATE_BASE_ADDRESS)) { GENX(STATE_BASE_ADDRESS_header), };
103
104 sba->GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
105 sba->GeneralStateMOCS = mocs;
106 sba->GeneralStateBufferSize = 0xfffff;
107 sba->GeneralStateBaseAddressModifyEnable = true;
108 sba->GeneralStateBufferSizeModifyEnable = true;
109
110 #if GFX_VERx10 == 120
111 /* Since DG2, scratch surfaces have their own surface state with its own
112 * MOCS setting, but prior to that, the MOCS for scratch accesses is
113 * governed by SBA.StatelessDataPortAccessMOCS.
114 */
115 const isl_surf_usage_flags_t protected_usage =
116 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT ?
117 ISL_SURF_USAGE_PROTECTED_BIT : 0;
118 const uint32_t stateless_mocs = isl_mocs(&device->isl_dev, protected_usage, false);
119 #else
120 const uint32_t stateless_mocs = mocs;
121 #endif
122
123 sba->StatelessDataPortAccessMOCS = stateless_mocs;
124
125 #if GFX_VERx10 >= 125
126 sba->SurfaceStateBaseAddress =
127 (struct anv_address) { .offset =
128 device->physical->va.internal_surface_state_pool.addr,
129 };
130 #else
131 sba->SurfaceStateBaseAddress =
132 anv_cmd_buffer_surface_base_address(cmd_buffer);
133 #endif
134 sba->SurfaceStateMOCS = mocs;
135 sba->SurfaceStateBaseAddressModifyEnable = true;
136
137 sba->IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
138 sba->IndirectObjectMOCS = mocs;
139 sba->IndirectObjectBufferSize = 0xfffff;
140 sba->IndirectObjectBaseAddressModifyEnable = true;
141 sba->IndirectObjectBufferSizeModifyEnable = true;
142
143 sba->InstructionBaseAddress =
144 (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
145 sba->InstructionMOCS = mocs;
146 sba->InstructionBufferSize =
147 device->physical->va.instruction_state_pool.size / 4096;
148 sba->InstructionBaseAddressModifyEnable = true;
149 sba->InstructionBuffersizeModifyEnable = true;
150
151 #if GFX_VER >= 11
152 sba->BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
153 sba->BindlessSamplerStateBufferSize = 0;
154 sba->BindlessSamplerStateMOCS = mocs;
155 sba->BindlessSamplerStateBaseAddressModifyEnable = true;
156 #endif
157
158 sba->DynamicStateBaseAddress = (struct anv_address) {
159 .offset = device->physical->va.dynamic_state_pool.addr,
160 };
161 sba->DynamicStateBufferSize =
162 (device->physical->va.dynamic_state_pool.size +
163 device->physical->va.dynamic_visible_pool.size +
164 device->physical->va.push_descriptor_buffer_pool.size) / 4096;
165 sba->DynamicStateMOCS = mocs;
166 sba->DynamicStateBaseAddressModifyEnable = true;
167 sba->DynamicStateBufferSizeModifyEnable = true;
168
169 if (cmd_buffer->state.pending_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
170 #if GFX_VERx10 >= 125
171 sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
172 .offset = device->physical->va.dynamic_visible_pool.addr,
173 };
174 sba->BindlessSurfaceStateSize =
175 (device->physical->va.dynamic_visible_pool.size +
176 device->physical->va.push_descriptor_buffer_pool.size) - 1;
177 sba->BindlessSurfaceStateMOCS = mocs;
178 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
179 #else
180 const uint64_t surfaces_addr =
181 cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
182 cmd_buffer->state.descriptor_buffers.surfaces_address :
183 anv_address_physical(device->workaround_address);
184 const uint64_t surfaces_size =
185 cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
186 MIN2(device->physical->va.dynamic_visible_pool.size -
187 (cmd_buffer->state.descriptor_buffers.surfaces_address -
188 device->physical->va.dynamic_visible_pool.addr),
189 anv_physical_device_bindless_heap_size(device->physical, true)) :
190 (device->workaround_bo->size - device->workaround_address.offset);
191 sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
192 .offset = surfaces_addr,
193 };
194 sba->BindlessSurfaceStateSize = surfaces_size / ANV_SURFACE_STATE_SIZE - 1;
195 sba->BindlessSurfaceStateMOCS = mocs;
196 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
197 #endif /* GFX_VERx10 < 125 */
198 } else if (!device->physical->indirect_descriptors) {
199 #if GFX_VERx10 >= 125
200 sba->BindlessSurfaceStateBaseAddress = (struct anv_address) {
201 .offset = device->physical->va.internal_surface_state_pool.addr,
202 };
203 sba->BindlessSurfaceStateSize =
204 (device->physical->va.internal_surface_state_pool.size +
205 device->physical->va.bindless_surface_state_pool.size) - 1;
206 sba->BindlessSurfaceStateMOCS = mocs;
207 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
208 #else
209 unreachable("Direct descriptor not supported");
210 #endif
211 } else {
212 sba->BindlessSurfaceStateBaseAddress =
213 (struct anv_address) { .offset =
214 device->physical->va.bindless_surface_state_pool.addr,
215 };
216 sba->BindlessSurfaceStateSize =
217 anv_physical_device_bindless_heap_size(device->physical, false) /
218 ANV_SURFACE_STATE_SIZE - 1;
219 sba->BindlessSurfaceStateMOCS = mocs;
220 sba->BindlessSurfaceStateBaseAddressModifyEnable = true;
221 }
222
223 #if GFX_VERx10 >= 125
224 sba->L1CacheControl = L1CC_WB;
225 #endif
226 }
227
228 void
229 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
230 {
231 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
232 anv_cmd_buffer_is_video_queue(cmd_buffer))
233 return;
234
235 struct anv_device *device = cmd_buffer->device;
236
237 struct GENX(STATE_BASE_ADDRESS) sba = {};
238 fill_state_base_addr(cmd_buffer, &sba);
239
240 #if GFX_VERx10 >= 125
241 struct mi_builder b;
242 mi_builder_init(&b, device->info, &cmd_buffer->batch);
243 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
244 struct mi_goto_target t = MI_GOTO_TARGET_INIT;
245 mi_goto_if(&b,
246 mi_ieq(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
247 mi_imm(sba.BindlessSurfaceStateBaseAddress.offset)),
248 &t);
249 #endif
250
251 /* Emit a render target cache flush.
252 *
253 * This isn't documented anywhere in the PRM. However, it seems to be
254 * necessary prior to changing the surface state base address. Without
255 * this, we get GPU hangs when using multi-level command buffers which
256 * clear depth, reset state base address, and then go render stuff.
257 *
258 * Render target cache flush before SBA is required by Wa_18039438632.
259 */
260 genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
261 cmd_buffer->state.current_pipeline,
262 #if GFX_VER >= 12
263 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
264 #else
265 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
266 #endif
267 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
268 ANV_PIPE_CS_STALL_BIT);
269
270 #if INTEL_NEEDS_WA_1607854226
271 /* Wa_1607854226:
272 *
273 * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
274 * mode by putting the pipeline temporarily in 3D mode.
275 */
276 uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
277 genX(flush_pipeline_select_3d)(cmd_buffer);
278 #endif
279
280 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), _sba) {
281 _sba = sba;
282 }
283
284 if (cmd_buffer->state.current_db_mode != cmd_buffer->state.pending_db_mode)
285 cmd_buffer->state.current_db_mode = cmd_buffer->state.pending_db_mode;
286
287 #if INTEL_NEEDS_WA_1607854226
288 /* Wa_1607854226:
289 *
290 * Put the pipeline back into its current mode.
291 */
292 if (gfx12_wa_pipeline != UINT32_MAX)
293 genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
294 #endif
295
296 /* After re-setting the surface state base address, we have to do some
297 * cache flushing so that the sampler engine will pick up the new
298 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
299 * Shared Function > 3D Sampler > State > State Caching (page 96):
300 *
301 * Coherency with system memory in the state cache, like the texture
302 * cache is handled partially by software. It is expected that the
303 * command stream or shader will issue Cache Flush operation or
304 * Cache_Flush sampler message to ensure that the L1 cache remains
305 * coherent with system memory.
306 *
307 * [...]
308 *
309 * Whenever the value of the Dynamic_State_Base_Addr,
310 * Surface_State_Base_Addr are altered, the L1 state cache must be
311 * invalidated to ensure the new surface or sampler state is fetched
312 * from system memory.
313 *
314 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
315 * which, according to the PIPE_CONTROL instruction documentation in the
316 * Broadwell PRM:
317 *
318 * Setting this bit is independent of any other bit in this packet.
319 * This bit controls the invalidation of the L1 and L2 state caches
320 * at the top of the pipe i.e. at the parsing time.
321 *
322 * Unfortunately, experimentation seems to indicate that state cache
323 * invalidation through a PIPE_CONTROL does nothing whatsoever with
324 * regard to surface state and binding tables. Instead, it seems that
325 * invalidating the texture cache is what is actually needed.
326 *
327 * XXX: As far as we have been able to determine through
328 * experimentation, flushing the texture cache appears to be
329 * sufficient. The theory here is that all of the sampling/rendering
330 * units cache the binding table in the texture cache. However, we have
331 * yet to be able to actually confirm this.
332 *
333 * Wa_14013910100:
334 *
335 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
336 * or program pipe control with Instruction cache invalidate post
337 * STATE_BASE_ADDRESS command"
338 */
339 enum anv_pipe_bits bits =
340 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
341 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
342 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
343 (intel_needs_workaround(device->info, 16013000631) ?
344 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0);
345
346 genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
347 cmd_buffer->state.current_pipeline,
348 bits);
349
350 assert(cmd_buffer->state.current_db_mode !=
351 ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
352
353 #if GFX_VERx10 >= 125
354 assert(sba.BindlessSurfaceStateBaseAddress.offset != 0);
355 mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
356 mi_imm(sba.BindlessSurfaceStateBaseAddress.offset));
357
358 mi_goto_target(&b, &t);
359 #endif
360
361 #if GFX_VERx10 >= 125
362 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
363 #endif
364
365 /* If we have emitted a new state base address we probably need to re-emit
366 * binding tables.
367 */
368 cmd_buffer->state.descriptors_dirty |= ~0;
369 }
370
371 void
372 genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer)
373 {
374 if (!anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer))
375 return;
376
377 /* If we are emitting a new state base address we probably need to re-emit
378 * binding tables.
379 */
380 cmd_buffer->state.descriptors_dirty |= ~0;
381
382 #if GFX_VERx10 >= 125
383 struct anv_device *device = cmd_buffer->device;
384 const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
385
386 /* We're changing the base location of binding tables, which affects the
387 * state cache. We're adding texture cache invalidation following a
388 * recommendation from the ICL PRMs, Volume 9: Render Engine, Coherency
389 * Mechanisms:
390 *
391 * "It is strongly recommended that a Texture cache invalidation be done
392 * whenever a State cache invalidation is done."
393 *
394 * Prior to doing the invalidation, we need a CS_STALL to ensure that all
395 * work using surface states has completed.
396 */
397 genx_batch_emit_pipe_control(&cmd_buffer->batch,
398 cmd_buffer->device->info,
399 cmd_buffer->state.current_pipeline,
400 ANV_PIPE_CS_STALL_BIT);
401 anv_batch_emit(
402 &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
403 btpa.BindingTablePoolBaseAddress =
404 anv_cmd_buffer_surface_base_address(cmd_buffer);
405 btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
406 btpa.MOCS = mocs;
407 }
408 genx_batch_emit_pipe_control(&cmd_buffer->batch,
409 cmd_buffer->device->info,
410 cmd_buffer->state.current_pipeline,
411 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
412 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
413
414 #else /* GFX_VERx10 < 125 */
415 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
416 #endif
417 }
418
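/* Track a BO referenced by a surface state so it is kept resident when the
 * batch executes; allocation failures are recorded on the command buffer's
 * batch.
 */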
419 static void
420 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
421 struct anv_address addr)
422 {
423 VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
424 addr.bo);
425
426 if (unlikely(result != VK_SUCCESS))
427 anv_batch_set_error(&cmd_buffer->batch, result);
428 }
429
430 static void
431 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
432 const struct anv_surface_state *state)
433 {
434 assert(!anv_address_is_null(state->address));
435 add_surface_reloc(cmd_buffer, state->address);
436
437 if (!anv_address_is_null(state->aux_address)) {
438 VkResult result =
439 anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
440 state->aux_address.bo);
441 if (result != VK_SUCCESS)
442 anv_batch_set_error(&cmd_buffer->batch, result);
443 }
444
445 if (!anv_address_is_null(state->clear_address)) {
446 VkResult result =
447 anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
448 state->clear_address.bo);
449 if (result != VK_SUCCESS)
450 anv_batch_set_error(&cmd_buffer->batch, result);
451 }
452 }
453
454 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
455 * the initial layout is undefined, the HiZ buffer and depth buffer will
456 * represent the same data at the end of this operation.
457 */
458 static void
459 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
460 const struct anv_image *image,
461 uint32_t base_level, uint32_t level_count,
462 uint32_t base_layer, uint32_t layer_count,
463 VkImageLayout initial_layout,
464 VkImageLayout final_layout,
465 bool will_full_fast_clear)
466 {
467 const uint32_t depth_plane =
468 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
469 if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
470 return;
471
472 /* Initialize the indirect clear color prior to first use. */
473 const enum isl_format depth_format =
474 image->planes[depth_plane].primary_surface.isl.format;
475 const struct anv_address clear_color_addr =
476 anv_image_get_clear_color_addr(cmd_buffer->device, image, depth_format,
477 VK_IMAGE_ASPECT_DEPTH_BIT, true);
478 if (!anv_address_is_null(clear_color_addr) &&
479 (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
480 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED)) {
481 const union isl_color_value clear_value =
482 anv_image_hiz_clear_value(image);
483
484 uint32_t depth_value[4] = {};
485 isl_color_value_pack(&clear_value, depth_format, depth_value);
486
487 const uint32_t clear_pixel_offset = clear_color_addr.offset +
488 isl_get_sampler_clear_field_offset(cmd_buffer->device->info,
489 depth_format);
490 const struct anv_address clear_pixel_addr = {
491 .bo = clear_color_addr.bo,
492 .offset = clear_pixel_offset,
493 };
494
495 struct mi_builder b;
496 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
497 mi_builder_set_write_check(&b, true);
498 mi_store(&b, mi_mem32(clear_pixel_addr), mi_imm(depth_value[0]));
499 }
500
501 /* If will_full_fast_clear is set, the caller promises to fast-clear the
502 * largest portion of the specified range as it can.
503 */
504 if (will_full_fast_clear)
505 return;
506
507 const enum isl_aux_state initial_state =
508 anv_layout_to_aux_state(cmd_buffer->device->info, image,
509 VK_IMAGE_ASPECT_DEPTH_BIT,
510 initial_layout,
511 cmd_buffer->queue_family->queueFlags);
512 const enum isl_aux_state final_state =
513 anv_layout_to_aux_state(cmd_buffer->device->info, image,
514 VK_IMAGE_ASPECT_DEPTH_BIT,
515 final_layout,
516 cmd_buffer->queue_family->queueFlags);
517
518 const bool initial_depth_valid =
519 isl_aux_state_has_valid_primary(initial_state);
520 const bool initial_hiz_valid =
521 isl_aux_state_has_valid_aux(initial_state);
522 const bool final_needs_depth =
523 isl_aux_state_has_valid_primary(final_state);
524 const bool final_needs_hiz =
525 isl_aux_state_has_valid_aux(final_state);
526
527 /* Getting into the pass-through state for Depth is tricky and involves
528 * both a resolve and an ambiguate. We don't handle that state right now
529 * as anv_layout_to_aux_state never returns it.
530 */
531 assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
532
533 enum isl_aux_op hiz_op = ISL_AUX_OP_NONE;
534 if (final_needs_depth && !initial_depth_valid) {
535 assert(initial_hiz_valid);
536 hiz_op = ISL_AUX_OP_FULL_RESOLVE;
537 } else if (final_needs_hiz && !initial_hiz_valid) {
538 assert(initial_depth_valid);
539 hiz_op = ISL_AUX_OP_AMBIGUATE;
540 }
541
542 if (hiz_op != ISL_AUX_OP_NONE) {
543 for (uint32_t l = 0; l < level_count; l++) {
544 const uint32_t level = base_level + l;
545
546 uint32_t aux_layers =
547 anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level);
548 if (base_layer >= aux_layers)
549 break; /* We will only get fewer layers as level increases */
550 uint32_t level_layer_count =
551 MIN2(layer_count, aux_layers - base_layer);
552
553 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
554 level, base_layer, level_layer_count, hiz_op);
555 }
556 }
557
558 /* Additional tile cache flush for MTL:
559 *
560 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
561 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
562 */
563 if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
564 image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
565 final_needs_depth && !initial_depth_valid) {
566 anv_add_pending_pipe_bits(cmd_buffer,
567 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
568 "HIZ-CCS flush");
569 }
570 }
571
572 /* Transitions a stencil buffer from one layout to another. On Gfx12, if the
573 * initial layout is undefined and stencil compression is in use, the stencil
574 * buffer is initialized with a stencil clear so compression works correctly.
575 */
576 static void
577 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
578 const struct anv_image *image,
579 uint32_t base_level, uint32_t level_count,
580 uint32_t base_layer, uint32_t layer_count,
581 VkImageLayout initial_layout,
582 VkImageLayout final_layout,
583 bool will_full_fast_clear)
584 {
585 #if GFX_VER == 12
586 const uint32_t plane =
587 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
588 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
589 return;
590
591 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
592 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
593 cmd_buffer->device->info->has_aux_map) {
594 /* If will_full_fast_clear is set, the caller promises to fast-clear the
595 * largest portion of the specified range as it can.
596 */
597 if (will_full_fast_clear)
598 return;
599
600 for (uint32_t l = 0; l < level_count; l++) {
601 const uint32_t level = base_level + l;
602 const VkRect2D clear_rect = {
603 .offset.x = 0,
604 .offset.y = 0,
605 .extent.width = u_minify(image->vk.extent.width, level),
606 .extent.height = u_minify(image->vk.extent.height, level),
607 };
608
609 uint32_t aux_layers =
610 anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
611
612 if (base_layer >= aux_layers)
613 break; /* We will only get fewer layers as level increases */
614
615 uint32_t level_layer_count =
616 MIN2(layer_count, aux_layers - base_layer);
617
618 /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
619 * Enable:
620 *
621 * "When enabled, Stencil Buffer needs to be initialized via
622 * stencil clear (HZ_OP) before any renderpass."
623 */
624 const VkClearDepthStencilValue clear_value = {};
625 anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
626 level, base_layer, level_layer_count,
627 clear_rect, &clear_value);
628 }
629 }
630
631 /* Additional tile cache flush for MTL:
632 *
633 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
634 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
635 */
636 if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
637 anv_add_pending_pipe_bits(cmd_buffer,
638 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
639 "HIZ-CCS flush");
640 }
641 #endif
642 }
643
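/* Command streamer register offsets used to set up MI_PREDICATE below. */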
644 #define MI_PREDICATE_SRC0 0x2400
645 #define MI_PREDICATE_SRC1 0x2408
646 #define MI_PREDICATE_RESULT 0x2418
647
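/* Update the per-(level, layer) compression tracking dword of a CCS_E image:
 * non-zero means the slice may contain compressed data and may therefore
 * need a resolve at the next layout transition.
 */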
648 static void
649 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
650 const struct anv_image *image,
651 VkImageAspectFlagBits aspect,
652 uint32_t level,
653 uint32_t base_layer, uint32_t layer_count,
654 bool compressed)
655 {
656 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
657
658 /* We only have compression tracking for CCS_E */
659 if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
660 return;
661
662 struct anv_device *device = cmd_buffer->device;
663 struct mi_builder b;
664 mi_builder_init(&b, device->info, &cmd_buffer->batch);
665 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
666
667 for (uint32_t a = 0; a < layer_count; a++) {
668 uint32_t layer = base_layer + a;
669 struct anv_address comp_state_addr =
670 anv_image_get_compression_state_addr(device,
671 image, aspect,
672 level, layer);
673 mi_store(&b, mi_mem32(comp_state_addr),
674 mi_imm(compressed ? UINT32_MAX : 0));
675 }
676
677 /* FCV_CCS_E images are automatically fast-cleared to the default value at
678 * render time. In order to account for this, anv should set the
679 * appropriate fast clear state for level0/layer0.
680 *
681 * At the moment, tracking the fast clear state for higher levels/layers is
682 * neither supported, nor do we enter a situation where it is a concern.
683 */
684 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
685 base_layer == 0 && level == 0) {
686 struct anv_address fc_type_addr =
687 anv_image_get_fast_clear_type_addr(device, image, aspect);
688 mi_store(&b, mi_mem32(fc_type_addr),
689 mi_imm(ANV_FAST_CLEAR_DEFAULT_VALUE));
690 }
691 }
692
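/* Record the fast clear type of the first slice in the image's fast clear
 * state buffer and, if a fast clear value is present, also mark that slice
 * as compressed.
 */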
693 static void
694 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
695 const struct anv_image *image,
696 VkImageAspectFlagBits aspect,
697 enum anv_fast_clear_type fast_clear)
698 {
699 struct anv_device *device = cmd_buffer->device;
700 struct mi_builder b;
701 mi_builder_init(&b, device->info, &cmd_buffer->batch);
702 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
703
704 struct anv_address fc_type_addr =
705 anv_image_get_fast_clear_type_addr(device, image, aspect);
706 mi_store(&b, mi_mem32(fc_type_addr), mi_imm(fast_clear));
707
708 /* Whenever we have fast-clear, we consider that slice to be compressed.
709 * This makes building predicates much easier.
710 */
711 if (fast_clear != ANV_FAST_CLEAR_NONE)
712 set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
713 }
714
715 /* This is only really practical on haswell and above because it requires
716 * MI math in order to get it correct.
717 */
718 static void
719 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
720 const struct anv_image *image,
721 VkImageAspectFlagBits aspect,
722 uint32_t level, uint32_t array_layer,
723 enum isl_aux_op resolve_op,
724 enum anv_fast_clear_type fast_clear_supported)
725 {
726 struct anv_device *device = cmd_buffer->device;
727 struct anv_address addr =
728 anv_image_get_fast_clear_type_addr(device, image, aspect);
729 struct mi_builder b;
730 mi_builder_init(&b, device->info, &cmd_buffer->batch);
731 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
732
733 const struct mi_value fast_clear_type = mi_mem32(addr);
734
735 if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
736 /* In this case, we're doing a full resolve which means we want the
737 * resolve to happen if any compression (including fast-clears) is
738 * present.
739 *
740 * In order to simplify the logic a bit, we make the assumption that,
741 * if the first slice has been fast-cleared, it is also marked as
742 * compressed. See also set_image_fast_clear_state.
743 */
744 const struct mi_value compression_state =
745 mi_mem32(anv_image_get_compression_state_addr(device,
746 image, aspect,
747 level, array_layer));
748 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
749 mi_store(&b, compression_state, mi_imm(0));
750
751 if (level == 0 && array_layer == 0) {
752 /* If the predicate is true, we want to write 0 to the fast clear type
753 * and, if it's false, leave it alone. We can do this by writing
754 *
755 * clear_type = clear_type & ~predicate;
756 */
757 struct mi_value new_fast_clear_type =
758 mi_iand(&b, fast_clear_type,
759 mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
760 mi_store(&b, fast_clear_type, new_fast_clear_type);
761 }
762 } else if (level == 0 && array_layer == 0) {
763 /* In this case, we are doing a partial resolve to get rid of fast-clear
764 * colors. We don't care about the compression state but we do care
765 * about how much fast clear is allowed by the final layout.
766 */
767 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
768 assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
769
770 /* We need to compute (fast_clear_supported < image->fast_clear) */
771 struct mi_value pred =
772 mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
773 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
774
775 /* If the predicate is true, we want to write 0 to the fast clear type
776 * and, if it's false, leave it alone. We can do this by writing
777 *
778 * clear_type = clear_type & ~predicate;
779 */
780 struct mi_value new_fast_clear_type =
781 mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
782 mi_store(&b, fast_clear_type, new_fast_clear_type);
783 } else {
784 /* In this case, we're trying to do a partial resolve on a slice that
785 * doesn't have clear color. There's nothing to do.
786 */
787 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
788 return;
789 }
790
791 /* Set src1 to 0 and use a != condition */
792 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
793
794 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
795 mip.LoadOperation = LOAD_LOADINV;
796 mip.CombineOperation = COMBINE_SET;
797 mip.CompareOperation = COMPARE_SRCS_EQUAL;
798 }
799 }
800
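/* Emit a CCS resolve of a single slice, predicated on the compression and
 * fast-clear tracking state maintained by the helpers above.
 */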
801 static void
802 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
803 const struct anv_image *image,
804 enum isl_format format,
805 struct isl_swizzle swizzle,
806 VkImageAspectFlagBits aspect,
807 uint32_t level, uint32_t array_layer,
808 enum isl_aux_op resolve_op,
809 enum anv_fast_clear_type fast_clear_supported)
810 {
811 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
812
813 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
814 aspect, level, array_layer,
815 resolve_op, fast_clear_supported);
816
817 /* CCS_D only supports full resolves and BLORP will assert on us if we try
818 * to do a partial resolve on a CCS_D surface.
819 */
820 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
821 image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
822 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
823
824 anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
825 level, array_layer, 1, resolve_op, NULL, true);
826 }
827
828 static void
829 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
830 const struct anv_image *image,
831 enum isl_format format,
832 struct isl_swizzle swizzle,
833 VkImageAspectFlagBits aspect,
834 uint32_t array_layer,
835 enum isl_aux_op resolve_op,
836 enum anv_fast_clear_type fast_clear_supported)
837 {
838 assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
839 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
840
841 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
842 aspect, 0, array_layer,
843 resolve_op, fast_clear_supported);
844
845 anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
846 array_layer, 1, resolve_op, NULL, true);
847 }
848
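/* Mark a range of slices as written with compression so that later layout
 * transitions know a resolve may be required. Not needed on Xe2+, where this
 * tracking is unused.
 */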
849 void
850 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
851 const struct anv_image *image,
852 VkImageAspectFlagBits aspect,
853 enum isl_aux_usage aux_usage,
854 uint32_t level,
855 uint32_t base_layer,
856 uint32_t layer_count)
857 {
858 #if GFX_VER < 20
859 /* The aspect must be exactly one of the image aspects. */
860 assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
861
862 /* Filter out aux usages that don't have any compression tracking.
863 * Note: We only have compression tracking for CCS_E images, but it's
864 * possible for a CCS_E enabled image to have a subresource with a different
865 * aux usage.
866 */
867 if (!isl_aux_usage_has_compression(aux_usage))
868 return;
869
870 set_image_compressed_bit(cmd_buffer, image, aspect,
871 level, base_layer, layer_count, true);
872 #endif
873 }
874
875 /* Copy the fast-clear value dword(s) between a surface state object and an
876 * image's fast clear state buffer.
877 */
878 void
879 genX(cmd_buffer_load_clear_color)(struct anv_cmd_buffer *cmd_buffer,
880 struct anv_state surface_state,
881 const struct anv_image_view *iview)
882 {
883 #if GFX_VER < 10
884 struct anv_address ss_clear_addr =
885 anv_state_pool_state_address(
886 &cmd_buffer->device->internal_surface_state_pool,
887 (struct anv_state) {
888 .offset = surface_state.offset +
889 cmd_buffer->device->isl_dev.ss.clear_value_offset
890 });
891 const struct anv_address entry_addr =
892 anv_image_get_clear_color_addr(cmd_buffer->device, iview->image,
893 iview->planes[0].isl.format,
894 VK_IMAGE_ASPECT_COLOR_BIT, false);
895
896 unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
897
898 struct mi_builder b;
899 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
900 mi_builder_set_write_check(&b, true);
901
902 mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
903
904 /* Updating a surface state object may require that the state cache be
905 * invalidated. From the SKL PRM, Shared Functions -> State -> State
906 * Caching:
907 *
908 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
909 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
910 * modified [...], the L1 state cache must be invalidated to ensure
911 * the new surface or sampler state is fetched from system memory.
912 *
913 * In testing, SKL doesn't actually seem to need this, but HSW does.
914 */
915 anv_add_pending_pipe_bits(cmd_buffer,
916 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
917 "after load_clear_color surface state update");
918 #endif
919 }
920
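/* Write the packed clear color into the image's indirect clear color buffer
 * for each of the image's view formats.
 */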
921 static void
922 set_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
923 const struct anv_image *image,
924 const VkImageAspectFlags aspect,
925 const uint32_t *pixel)
926 {
927 for (int i = 0; i < image->num_view_formats; i++) {
928 union isl_color_value clear_color;
929 isl_color_value_unpack(&clear_color, image->view_formats[i], pixel);
930
931 UNUSED union isl_color_value sample_color = clear_color;
932 if (isl_format_is_srgb(image->view_formats[i])) {
933 sample_color.f32[0] =
934 util_format_linear_to_srgb_float(clear_color.f32[0]);
935 sample_color.f32[1] =
936 util_format_linear_to_srgb_float(clear_color.f32[1]);
937 sample_color.f32[2] =
938 util_format_linear_to_srgb_float(clear_color.f32[2]);
939 }
940
941 const struct anv_address addr =
942 anv_image_get_clear_color_addr(cmd_buffer->device, image,
943 image->view_formats[i], aspect,
944 false);
945 assert(!anv_address_is_null(addr));
946
947 #if GFX_VER >= 11
948 assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
949 uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 3 + 6,
950 GENX(MI_STORE_DATA_IMM),
951 .StoreQword = true, .Address = addr);
952 dw[3] = clear_color.u32[0];
953 dw[4] = clear_color.u32[1];
954 dw[5] = clear_color.u32[2];
955 dw[6] = clear_color.u32[3];
956 dw[7] = pixel[0];
957 dw[8] = pixel[1];
958 #else
959 assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 0);
960 assert(cmd_buffer->device->isl_dev.ss.clear_value_size == 16);
961 uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 3 + 8,
962 GENX(MI_STORE_DATA_IMM),
963 .StoreQword = true, .Address = addr);
964 dw[3] = clear_color.u32[0];
965 dw[4] = clear_color.u32[1];
966 dw[5] = clear_color.u32[2];
967 dw[6] = clear_color.u32[3];
968 dw[7] = sample_color.u32[0];
969 dw[8] = sample_color.u32[1];
970 dw[9] = sample_color.u32[2];
971 dw[10] = sample_color.u32[3];
972 #endif
973 }
974 }
975
976 void
977 genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
978 const struct anv_image *image,
979 const enum isl_format format,
980 const struct isl_swizzle swizzle,
981 union isl_color_value clear_color)
982 {
983 uint32_t pixel[4] = {};
984 union isl_color_value swiz_color =
985 isl_color_value_swizzle_inv(clear_color, swizzle);
986 isl_color_value_pack(&swiz_color, format, pixel);
987 set_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, pixel);
988
989 if (isl_color_value_is_zero(clear_color, format)) {
990 /* This image has the auxiliary buffer enabled. We can mark the
991 * subresource as not needing a resolve because the clear color
992 * will match what's in every RENDER_SURFACE_STATE object when
993 * it's being used for sampling.
994 */
995 set_image_fast_clear_state(cmd_buffer, image,
996 VK_IMAGE_ASPECT_COLOR_BIT,
997 ANV_FAST_CLEAR_DEFAULT_VALUE);
998 } else {
999 set_image_fast_clear_state(cmd_buffer, image,
1000 VK_IMAGE_ASPECT_COLOR_BIT,
1001 ANV_FAST_CLEAR_ANY);
1002 }
1003 }
1004
1005 /**
1006 * @brief Transitions a color buffer from one layout to another.
1007 *
1008 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
1009 * more information.
1010 *
1011 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
1012 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
1013 * this represents the maximum layers to transition at each
1014 * specified miplevel.
1015 */
1016 static void
1017 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
1018 const struct anv_image *image,
1019 VkImageAspectFlagBits aspect,
1020 const uint32_t base_level, uint32_t level_count,
1021 uint32_t base_layer, uint32_t layer_count,
1022 VkImageLayout initial_layout,
1023 VkImageLayout final_layout,
1024 uint32_t src_queue_family,
1025 uint32_t dst_queue_family,
1026 bool will_full_fast_clear)
1027 {
1028 struct anv_device *device = cmd_buffer->device;
1029 const struct intel_device_info *devinfo = device->info;
1030 /* Validate the inputs. */
1031 assert(cmd_buffer);
1032 assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
1033 /* These values aren't supported for simplicity's sake. */
1034 assert(level_count != VK_REMAINING_MIP_LEVELS &&
1035 layer_count != VK_REMAINING_ARRAY_LAYERS);
1036 /* Ensure the subresource range is valid. */
1037 UNUSED uint64_t last_level_num = base_level + level_count;
1038 const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
1039 UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
1040 assert((uint64_t)base_layer + layer_count <= image_layers);
1041 assert(last_level_num <= image->vk.mip_levels);
1042 /* If there is a layout transition, the final layout cannot be undefined or
1043 * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
1044 */
1045 assert(initial_layout == final_layout ||
1046 (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
1047 final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
1048 const struct isl_drm_modifier_info *isl_mod_info =
1049 image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
1050 ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
1051 : NULL;
1052
1053 const bool src_queue_external =
1054 src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1055 src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1056
1057 const bool dst_queue_external =
1058 dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
1059 dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
1060
1061 /* If the queues are external, consider the first queue family's flags
1062 * (it should be the most capable one).
1063 */
1064 const VkQueueFlagBits src_queue_flags =
1065 device->physical->queue.families[
1066 (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
1067 0 : src_queue_family].queueFlags;
1068 const VkQueueFlagBits dst_queue_flags =
1069 device->physical->queue.families[
1070 (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
1071 0 : dst_queue_family].queueFlags;
1072
1073 /* Simultaneous acquire and release on external queues is illegal. */
1074 assert(!src_queue_external || !dst_queue_external);
1075
1076 /* Ownership transition on an external queue requires special action if the
1077 * image has a DRM format modifier because we store image data in
1078 * a driver-private bo which is inaccessible to the external queue.
1079 */
1080 const bool private_binding_acquire =
1081 src_queue_external &&
1082 anv_image_is_externally_shared(image) &&
1083 anv_image_has_private_binding(image);
1084
1085 const bool private_binding_release =
1086 dst_queue_external &&
1087 anv_image_is_externally_shared(image) &&
1088 anv_image_has_private_binding(image);
1089
1090 if (initial_layout == final_layout &&
1091 !private_binding_acquire && !private_binding_release) {
1092 /* No work is needed. */
1093 return;
1094 }
1095
1096 /**
1097 * Section 7.7.4 of the Vulkan 1.3.260 spec says:
1098 *
1099 * If the transfer is via an image memory barrier, and an image layout
1100 * transition is desired, then the values of oldLayout and newLayout in the
1101 * release operation's memory barrier must be equal to values of oldLayout
1102 * and newLayout in the acquire operation's memory barrier. Although the
1103 * image layout transition is submitted twice, it will only be executed
1104 * once. A layout transition specified in this way happens-after the
1105 * release operation and happens-before the acquire operation.
1106 *
1107 * Because we know that we get matching transitions on each queue, we choose
1108 * to only do the work on one queue type: RENDER. In the cases where we do
1109 * transitions between COMPUTE & TRANSFER, we should have matching
1110 * aux/fast_clear values, which would trigger no work in the code below.
1111 */
1112 if (!(src_queue_external || dst_queue_external) &&
1113 src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
1114 dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
1115 src_queue_family != dst_queue_family) {
1116 enum intel_engine_class src_engine =
1117 cmd_buffer->queue_family->engine_class;
1118 if (src_engine != INTEL_ENGINE_CLASS_RENDER)
1119 return;
1120 }
1121
1122 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
1123
1124 if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
1125 return;
1126
1127 enum isl_aux_usage initial_aux_usage =
1128 anv_layout_to_aux_usage(devinfo, image, aspect, 0,
1129 initial_layout, src_queue_flags);
1130 enum isl_aux_usage final_aux_usage =
1131 anv_layout_to_aux_usage(devinfo, image, aspect, 0,
1132 final_layout, dst_queue_flags);
1133 enum anv_fast_clear_type initial_fast_clear =
1134 anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
1135 src_queue_flags);
1136 enum anv_fast_clear_type final_fast_clear =
1137 anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
1138 dst_queue_flags);
1139
1140 /* We must override the anv_layout_to_* functions because they are unaware
1141 * of acquire/release direction.
1142 */
1143 if (private_binding_acquire) {
1144 initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
1145 image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
1146 initial_fast_clear = isl_mod_info->supports_clear_color ?
1147 initial_fast_clear : ANV_FAST_CLEAR_NONE;
1148 } else if (private_binding_release) {
1149 final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
1150 image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
1151 final_fast_clear = isl_mod_info->supports_clear_color ?
1152 final_fast_clear : ANV_FAST_CLEAR_NONE;
1153 }
1154
1155 assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
1156
1157 /* The following layouts are equivalent for non-linear images. */
1158 const bool initial_layout_undefined =
1159 initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
1160 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
1161
1162 bool must_init_fast_clear_state = false;
1163 bool must_init_aux_surface = false;
1164
1165 if (initial_layout_undefined) {
1166 /* The subresource may have been aliased and populated with arbitrary
1167 * data, so we should initialize fast-clear state on platforms prior to
1168 * Xe2. Xe2+ platforms don't need it thanks to the new design of fast-
1169 * clear.
1170 */
1171 must_init_fast_clear_state = devinfo->ver < 20;
1172
1173 if (isl_aux_usage_has_mcs(image->planes[plane].aux_usage) ||
1174 devinfo->has_illegal_ccs_values) {
1175
1176 must_init_aux_surface = true;
1177
1178 } else {
1179 assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
1180
1181 /* We can start using the CCS immediately without ambiguating. The
1182 * two conditions that enable this are:
1183 *
1184 * 1) The device treats all possible CCS values as legal. In other
1185 * words, we can't confuse the hardware with random bits in the
1186 * CCS.
1187 *
1188 * 2) We enable compression on all writable image layouts. The CCS
1189 * will receive all writes and will therefore always be in sync
1190 * with the main surface.
1191 *
1192 * If we were to disable compression on some writable layouts, the
1193 * CCS could get out of sync with the main surface and the app
1194 * could lose the data it wrote previously. For example, this
1195 * could happen if an app: transitions from UNDEFINED w/o
1196 * ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
1197 *
1198 * The second condition is asserted below, but could be moved
1199 * elsewhere for more coverage (we're only checking transitions from
1200 * an undefined layout).
1201 */
1202 assert(vk_image_layout_is_read_only(final_layout, aspect) ||
1203 (final_aux_usage != ISL_AUX_USAGE_NONE));
1204
1205 must_init_aux_surface = false;
1206 }
1207
1208 } else if (private_binding_acquire) {
1209 /* The fast clear state lives in a driver-private bo, and therefore the
1210 * external/foreign queue is unaware of it.
1211 *
1212 * If this is the first time we are accessing the image, then the fast
1213 * clear state is uninitialized.
1214 *
1215 * If this is NOT the first time we are accessing the image, then the fast
1216 * clear state may still be valid and correct due to the resolve during
1217 * our most recent ownership release. However, we do not track the aux
1218 * state with MI stores, and therefore must assume the worst-case: that
1219 * this is the first time we are accessing the image.
1220 */
1221 assert(image->planes[plane].fast_clear_memory_range.binding ==
1222 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1223 must_init_fast_clear_state = true;
1224
1225 if (anv_image_get_aux_memory_range(image, plane)->binding ==
1226 ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1227 /* The aux surface, like the fast clear state, lives in
1228 * a driver-private bo. We must initialize the aux surface for the
1229 * same reasons we must initialize the fast clear state.
1230 */
1231 must_init_aux_surface = true;
1232 } else {
1233 /* The aux surface, unlike the fast clear state, lives in
1234 * application-visible VkDeviceMemory and is shared with the
1235 * external/foreign queue. Therefore, when we acquire ownership of the
1236 * image with a defined VkImageLayout, the aux surface is valid and has
1237 * the aux state required by the modifier.
1238 */
1239 must_init_aux_surface = false;
1240 }
1241 }
1242
1243 if (must_init_fast_clear_state) {
1244 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1245 /* Ensure the raw and converted clear colors are in sync. */
1246 const uint32_t zero_pixel[4] = {};
1247 set_image_clear_color(cmd_buffer, image, aspect, zero_pixel);
1248 }
1249 if (base_level == 0 && base_layer == 0) {
1250 set_image_fast_clear_state(cmd_buffer, image, aspect,
1251 ANV_FAST_CLEAR_NONE);
1252 }
1253 }
1254
1255 if (must_init_aux_surface) {
1256 assert(devinfo->ver >= 20 || must_init_fast_clear_state);
1257
1258 /* Initialize the aux buffers to enable correct rendering. In order to
1259 * ensure that things such as storage images work correctly, aux buffers
1260 * need to be initialized to valid data.
1261 *
1262 * Having an aux buffer with invalid data is a problem for two reasons:
1263 *
1264 * 1) Having an invalid value in the buffer can confuse the hardware.
1265 * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1266 * invalid and leads to the hardware doing strange things. It
1267 * doesn't hang as far as we can tell but rendering corruption can
1268 * occur.
1269 *
1270 * 2) If this transition is into the GENERAL layout and we then use the
1271 * image as a storage image, then we must have the aux buffer in the
1272 * pass-through state so that, if we then go to texture from the
1273 * image, we get the results of our storage image writes and not the
1274 * fast clear color or other random data.
1275 *
1276 * For CCS both of the problems above are real demonstrable issues. In
1277 * that case, the only thing we can do is to perform an ambiguate to
1278 * transition the aux surface into the pass-through state.
1279 *
1280 * For MCS, (2) is never an issue because we don't support multisampled
1281 * storage images. In theory, issue (1) is a problem with MCS but we've
1282 * never seen it in the wild. For 4x and 16x, all bit patterns could,
1283 * in theory, be interpreted as something but we don't know that all bit
1284 * patterns are actually valid. For 2x and 8x, you could easily end up
1285 * with the MCS referring to an invalid plane because not all bits of
1286 * the MCS value are actually used. Even though we've never seen issues
1287 * in the wild, it's best to play it safe and initialize the MCS. We
1288 * could use a fast-clear for MCS because we only ever touch from render
1289 * and texture (no image load store). However, due to WA 14013111325,
1290 * we choose to ambiguate MCS as well.
1291 */
1292 if (image->vk.samples == 1) {
1293 for (uint32_t l = 0; l < level_count; l++) {
1294 const uint32_t level = base_level + l;
1295
1296 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1297 if (base_layer >= aux_layers)
1298 break; /* We will only get fewer layers as level increases */
1299 uint32_t level_layer_count =
1300 MIN2(layer_count, aux_layers - base_layer);
1301
1302 /* If will_full_fast_clear is set, the caller promises to
1303 * fast-clear the largest portion of the specified range as it can.
1304 * For color images, that means only the first LOD and array slice.
1305 */
1306 if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1307 base_layer++;
1308 level_layer_count--;
1309 if (level_layer_count == 0)
1310 continue;
1311 }
1312
1313 anv_image_ccs_op(cmd_buffer, image,
1314 image->planes[plane].primary_surface.isl.format,
1315 ISL_SWIZZLE_IDENTITY,
1316 aspect, level, base_layer, level_layer_count,
1317 ISL_AUX_OP_AMBIGUATE, NULL, false);
1318
1319 set_image_compressed_bit(cmd_buffer, image, aspect, level,
1320 base_layer, level_layer_count, false);
1321 }
1322 } else {
1323 /* If will_full_fast_clear is set, the caller promises to fast-clear
1324 * the largest portion of the specified range as it can.
1325 */
1326 if (will_full_fast_clear)
1327 return;
1328
1329 assert(base_level == 0 && level_count == 1);
1330 anv_image_mcs_op(cmd_buffer, image,
1331 image->planes[plane].primary_surface.isl.format,
1332 ISL_SWIZZLE_IDENTITY,
1333 aspect, base_layer, layer_count,
1334 ISL_AUX_OP_AMBIGUATE, NULL, false);
1335 }
1336 return;
1337 }
1338
1339 /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1340 * We can handle transitions between CCS_D/E to and from NONE. What we
1341 * don't yet handle is switching between CCS_E and CCS_D within a given
1342 * image. Doing so in a performant way requires more detailed aux state
1343 * tracking such as what is done in i965. For now, just assume that we
1344 * only have one type of compression.
1345 */
1346 assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1347 final_aux_usage == ISL_AUX_USAGE_NONE ||
1348 initial_aux_usage == final_aux_usage);
1349
1350 /* If initial aux usage is NONE, there is nothing to resolve */
1351 if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1352 return;
1353
1354 enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1355
1356 /* If the initial layout supports more fast clear than the final layout
1357 * then we need at least a partial resolve.
1358 */
1359 if (final_fast_clear < initial_fast_clear) {
1360 /* Partial resolves will actually only occur on layer 0/level 0. This
1361 * is generally okay because anv only allows explicit fast clears to
1362 * the first subresource.
1363 *
1364 * The situation is a bit different with FCV_CCS_E. With that aux
1365 * usage, implicit fast clears can occur on any layer and level.
1366 * anv doesn't track fast clear states for more than the first
1367 * subresource, so we need to assert that a layout transition doesn't
1368 * attempt to partial resolve the other subresources.
1369 *
1370 * At the moment, we don't enter such a situation, and partial resolves
1371 * for higher level/layer resources shouldn't be a concern.
1372 */
1373 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1374 assert(base_level == 0 && level_count == 1 &&
1375 base_layer == 0 && layer_count == 1);
1376 }
1377 resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1378 }
1379
1380 if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
1381 !isl_aux_usage_has_ccs_e(final_aux_usage))
1382 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1383
1384 if (resolve_op == ISL_AUX_OP_NONE)
1385 return;
1386
1387 for (uint32_t l = 0; l < level_count; l++) {
1388 uint32_t level = base_level + l;
1389
1390 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1391 if (base_layer >= aux_layers)
1392 break; /* We will only get fewer layers as level increases */
1393 uint32_t level_layer_count =
1394 MIN2(layer_count, aux_layers - base_layer);
1395
1396 for (uint32_t a = 0; a < level_layer_count; a++) {
1397 uint32_t array_layer = base_layer + a;
1398
1399 /* If will_full_fast_clear is set, the caller promises to fast-clear
1400 * as large a portion of the specified range as it can. For color
1401 * images, that means only the first LOD and array slice.
1402 */
1403 if (level == 0 && array_layer == 0 && will_full_fast_clear)
1404 continue;
1405
1406 if (image->vk.samples == 1) {
1407 anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1408 image->planes[plane].primary_surface.isl.format,
1409 ISL_SWIZZLE_IDENTITY,
1410 aspect, level, array_layer, resolve_op,
1411 final_fast_clear);
1412 } else {
1413 /* We only support fast-clear on the first layer, so partial
1414 * resolves should not be used on other layers: they would use
1415 * the clear color stored in memory, which is only valid for layer 0.
1416 */
1417 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1418 array_layer != 0)
1419 continue;
1420
1421 anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1422 image->planes[plane].primary_surface.isl.format,
1423 ISL_SWIZZLE_IDENTITY,
1424 aspect, array_layer, resolve_op,
1425 final_fast_clear);
1426 }
1427 }
1428 }
1429 }
1430
1431 static MUST_CHECK VkResult
1432 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1433 uint32_t color_att_count)
1434 {
1435 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1436
1437 /* Reserve one for the NULL state. */
1438 unsigned num_states = 1 + color_att_count;
1439 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1440 const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
1441 gfx->att_states =
1442 anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
1443 if (gfx->att_states.map == NULL)
1444 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1445
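/* The allocation above is carved into fixed-stride surface states laid
 * out as [ null SS | color att 0 | color att 1 | ... ], with consecutive
 * entries ss_stride bytes apart; the code below simply hands that layout
 * out in order.
 */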
1446 struct anv_state next_state = gfx->att_states;
1447 next_state.alloc_size = isl_dev->ss.size;
1448
1449 gfx->null_surface_state = next_state;
1450 next_state.offset += ss_stride;
1451 next_state.map += ss_stride;
1452
1453 gfx->color_att_count = color_att_count;
1454 for (uint32_t i = 0; i < color_att_count; i++) {
1455 gfx->color_att[i] = (struct anv_attachment) {
1456 .surface_state.state = next_state,
1457 };
1458 next_state.offset += ss_stride;
1459 next_state.map += ss_stride;
1460 }
1461 gfx->depth_att = (struct anv_attachment) { };
1462 gfx->stencil_att = (struct anv_attachment) { };
1463
1464 return VK_SUCCESS;
1465 }
1466
1467 static void
1468 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1469 {
1470 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1471
1472 gfx->render_area = (VkRect2D) { };
1473 gfx->layer_count = 0;
1474 gfx->samples = 0;
1475
1476 gfx->color_att_count = 0;
1477 gfx->depth_att = (struct anv_attachment) { };
1478 gfx->stencil_att = (struct anv_attachment) { };
1479 gfx->null_surface_state = ANV_STATE_NULL;
1480 }
1481
1482 /**
1483 * Program the hardware to use the specified L3 configuration.
1484 */
1485 void
1486 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1487 const struct intel_l3_config *cfg)
1488 {
1489 assert(cfg || GFX_VER >= 12);
1490 if (cfg == cmd_buffer->state.current_l3_config)
1491 return;
1492
1493 #if GFX_VER >= 11
1494 /* On Gfx11+ we use only one config, so verify it remains the same and skip
1495 * the stalling programming entirely.
1496 */
1497 assert(cfg == cmd_buffer->device->l3_config);
1498 #else
1499 if (INTEL_DEBUG(DEBUG_L3)) {
1500 mesa_logd("L3 config transition: ");
1501 intel_dump_l3_config(cfg, stderr);
1502 }
1503
1504 /* According to the hardware docs, the L3 partitioning can only be changed
1505 * while the pipeline is completely drained and the caches are flushed,
1506 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1507 */
1508 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1509 cmd_buffer->state.current_pipeline,
1510 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1511 ANV_PIPE_CS_STALL_BIT);
1512
1513 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1514 * invalidation of the relevant caches. Note that because RO invalidation
1515 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1516 * command is processed by the CS) we cannot combine it with the previous
1517 * stalling flush as the hardware documentation suggests, because that
1518 * would cause the CS to stall on previous rendering *after* RO
1519 * invalidation and wouldn't prevent the RO caches from being polluted by
1520 * concurrent rendering before the stall completes. This intentionally
1521 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1522 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1523 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1524 * already guarantee that there is no concurrent GPGPU kernel execution
1525 * (see SKL HSD 2132585).
1526 */
1527 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1528 cmd_buffer->state.current_pipeline,
1529 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
1530 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
1531 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
1532 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
1533
1534 /* Now send a third stalling flush to make sure that invalidation is
1535 * complete when the L3 configuration registers are modified.
1536 */
1537 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1538 cmd_buffer->state.current_pipeline,
1539 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1540 ANV_PIPE_CS_STALL_BIT);
1541
1542 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1543 #endif /* GFX_VER >= 11 */
1544 cmd_buffer->state.current_l3_config = cfg;
1545 }
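/* For reference, a rough sketch of what the pre-Gfx11 path above emits
 * (the exact register writes are platform dependent and handled by
 * genX(emit_l3_config)):
 *
 *    PIPE_CONTROL(DC flush | CS stall)            drain in-flight work
 *    PIPE_CONTROL(texture/constant/instruction/
 *                 state cache invalidate)         pipelined RO invalidation
 *    PIPE_CONTROL(DC flush | CS stall)            wait for the invalidation
 *    ...L3 configuration register programming...
 */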
1546
1547 ALWAYS_INLINE void
1548 genX(invalidate_aux_map)(struct anv_batch *batch,
1549 struct anv_device *device,
1550 enum intel_engine_class engine_class,
1551 enum anv_pipe_bits bits)
1552 {
1553 #if GFX_VER == 12
1554 if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
1555 uint32_t register_addr = 0;
1556 switch (engine_class) {
1557 case INTEL_ENGINE_CLASS_COMPUTE:
1558 register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
1559 break;
1560 case INTEL_ENGINE_CLASS_COPY:
1561 #if GFX_VERx10 >= 125
1562 register_addr = GENX(BCS_CCS_AUX_INV_num);
1563 #endif
1564 break;
1565 case INTEL_ENGINE_CLASS_VIDEO:
1566 register_addr = GENX(VD0_CCS_AUX_INV_num);
1567 break;
1568 case INTEL_ENGINE_CLASS_RENDER:
1569 default:
1570 register_addr = GENX(GFX_CCS_AUX_INV_num);
1571 break;
1572 }
1573
1574 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
1575 lri.RegisterOffset = register_addr;
1576 lri.DataDWord = 1;
1577 }
1578
1579 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
1580 if (intel_needs_workaround(device->info, 16018063123) &&
1581 engine_class == INTEL_ENGINE_CLASS_COPY) {
1582 genX(batch_emit_fast_color_dummy_blit)(batch, device);
1583 }
1584
1585 /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1586 *
1587 * "Poll Aux Invalidation bit once the invalidation is set
1588 * (Register 4208 bit 0)"
1589 */
1590 anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
1591 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
1592 sem.WaitMode = PollingMode;
1593 sem.RegisterPollMode = true;
1594 sem.SemaphoreDataDword = 0x0;
1595 sem.SemaphoreAddress =
1596 anv_address_from_u64(register_addr);
1597 }
1598 }
1599 #else
1600 assert(!device->info->has_aux_map);
1601 #endif
1602 }
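/* Sketch of what the Gfx12 path above emits for the render engine when the
 * aux map is in use (the register depends on the engine class):
 *
 *    MI_LOAD_REGISTER_IMM(GFX_CCS_AUX_INV, 1)       kick the invalidation
 *    MI_SEMAPHORE_WAIT(poll GFX_CCS_AUX_INV == 0)   wait for HW to clear it
 */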
1603
1604 ALWAYS_INLINE enum anv_pipe_bits
1605 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1606 struct anv_device *device,
1607 uint32_t current_pipeline,
1608 enum anv_pipe_bits bits,
1609 enum anv_pipe_bits *emitted_flush_bits)
1610 {
1611 #if GFX_VER >= 12
1612 /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
1613 *
1614 * "SW must follow below programming restrictions when programming
1615 * PIPE_CONTROL command [for ComputeCS]:
1616 * ...
1617 * Following bits must not be set when programmed for ComputeCS:
1618 * - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
1619 * and "Tile Cache Flush Enable"
1620 * - "Depth Stall Enable", Stall at Pixel Scoreboard and
1621 * "PSD Sync Enable".
1622 * - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
1623 * "AMFS Flush Enable", "VF Cache Invalidation Enable" and
1624 * "Global Snapshot Count Reset"."
1625 *
1626 * XXX: According to the spec this should not be a concern for a regular
1627 * RCS in GPGPU mode, but during testing it was found that at least the
1628 * "VF Cache Invalidation Enable" bit is ignored in that case.
1629 * This can cause us to miss some important invalidations
1630 * (e.g. from CmdPipelineBarriers) and have incoherent data.
1631 *
1632 * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
1633 * when specific 3d related bits are programmed in pipecontrol in
1634 * compute mode" that suggests not using "RT Cache Flush" in GPGPU mode.
1635 *
1636 * The other bits are not confirmed to cause problems, but are included
1637 * here just to be safe, as they're also not really relevant in GPGPU
1638 * mode, and deferring them doesn't seem to cause any regressions.
1639 *
1640 * So if we're currently in GPGPU mode, we hide those bits from
1641 * this flush and only flush them once we're able to.
1642 * The same applies to GPGPU-only bits while in 3D mode.
1643 */
1644 enum anv_pipe_bits defer_bits = bits &
1645 (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS: ANV_PIPE_GPGPU_BITS);
1646
1647 bits &= ~defer_bits;
1648 #endif
1649
1650 /*
1651 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1652 *
1653 * Write synchronization is a special case of end-of-pipe
1654 * synchronization that requires that the render cache and/or depth
1655 * related caches are flushed to memory, where the data will become
1656 * globally visible. This type of synchronization is required prior to
1657 * SW (CPU) actually reading the result data from memory, or initiating
1658 * an operation that will use as a read surface (such as a texture
1659 * surface) a previous render target and/or depth/stencil buffer
1660 *
1661 *
1662 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1663 *
1664 * Exercising the write cache flush bits (Render Target Cache Flush
1665 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1666 * ensures the write caches are flushed and doesn't guarantee the data
1667 * is globally visible.
1668 *
1669 * SW can track the completion of the end-of-pipe-synchronization by
1670 * using "Notify Enable" and "PostSync Operation - Write Immediate
1671 * Data" in the PIPE_CONTROL command.
1672 *
1673 * In other words, flushes are pipelined while invalidations are handled
1674 * immediately. Therefore, if we're flushing anything then we need to
1675 * schedule an end-of-pipe sync before any invalidations can happen.
1676 */
1677 if (bits & ANV_PIPE_FLUSH_BITS)
1678 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
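/* Concretely, a request that mixes flushes and invalidations is split by
 * the code below into two PIPE_CONTROLs, roughly:
 *
 *    PIPE_CONTROL(write-cache flushes | CS stall | post-sync write)
 *    PIPE_CONTROL(cache invalidations)
 *
 * so the invalidations cannot complete ahead of the data being flushed.
 * (Sketch only; the exact bits depend on what was requested.)
 */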
1679
1680 /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
1681 * RCS engine idle sequence:
1682 *
1683 * Gfx12+:
1684 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
1685 * Target Cache Flush + Depth Cache
1686 *
1687 * Gfx125+:
1688 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
1689 * Target Cache Flush + Depth Cache + CCS flush
1690 *
1691 * Compute engine idle sequence:
1692 *
1693 * Gfx12+:
1694 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall
1695 *
1696 * Gfx125+:
1697 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + CCS flush
1698 */
1699 if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
1700 if (current_pipeline == GPGPU) {
1701 bits |= (ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1702 ANV_PIPE_L3_FABRIC_FLUSH_BIT |
1703 ANV_PIPE_CS_STALL_BIT |
1704 (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1705 } else if (current_pipeline == _3D) {
1706 bits |= (ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1707 ANV_PIPE_L3_FABRIC_FLUSH_BIT |
1708 ANV_PIPE_CS_STALL_BIT |
1709 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1710 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
1711 (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1712 }
1713 }
1714
1715 /* If we're going to do an invalidate and we have a pending end-of-pipe
1716 * sync that has yet to be resolved, we do the end-of-pipe sync now.
1717 */
1718 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1719 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1720 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1721 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1722
1723 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
1724 fputs("acc: add ", stdout);
1725 anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stdout);
1726 fprintf(stdout, "reason: Ensure flushes done before invalidate\n");
1727 }
1728 }
1729
1730 /* Project: SKL / Argument: LRI Post Sync Operation [23]
1731 *
1732 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1733 * programmed prior to programming a PIPECONTROL command with "LRI
1734 * Post Sync Operation" in GPGPU mode of operation (i.e when
1735 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
1736 *
1737 * The same text exists a few rows below for Post Sync Op.
1738 */
1739 if (bits & ANV_PIPE_POST_SYNC_BIT) {
1740 if (GFX_VER == 9 && current_pipeline == GPGPU)
1741 bits |= ANV_PIPE_CS_STALL_BIT;
1742 bits &= ~ANV_PIPE_POST_SYNC_BIT;
1743 }
1744
1745 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1746 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1747 enum anv_pipe_bits flush_bits =
1748 bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1749 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1750
1751 uint32_t sync_op = NoWrite;
1752 struct anv_address addr = ANV_NULL_ADDRESS;
1753
1754 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1755 *
1756 * "The most common action to perform upon reaching a
1757 * synchronization point is to write a value out to memory. An
1758 * immediate value (included with the synchronization command) may
1759 * be written."
1760 *
1761 *
1762 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1763 *
1764 * "In case the data flushed out by the render engine is to be
1765 * read back in to the render engine in coherent manner, then the
1766 * render engine has to wait for the fence completion before
1767 * accessing the flushed data. This can be achieved by following
1768 * means on various products: PIPE_CONTROL command with CS Stall
1769 * and the required write caches flushed with Post-Sync-Operation
1770 * as Write Immediate Data.
1771 *
1772 * Example:
1773 * - Workload-1 (3D/GPGPU/MEDIA)
1774 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1775 * Immediate Data, Required Write Cache Flush bits set)
1776 * - Workload-2 (Can use the data produced or output by
1777 * Workload-1)
1778 */
1779 if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1780 flush_bits |= ANV_PIPE_CS_STALL_BIT;
1781 sync_op = WriteImmediateData;
1782 addr = device->workaround_address;
1783 }
1784
1785 /* Flush PC. */
1786 genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1787 sync_op, addr, 0, flush_bits);
1788
1789 /* If the caller wants to know what flushes have been emitted,
1790 * report the bits that were actually programmed in the PIPE_CONTROL.
1791 */
1792 if (emitted_flush_bits != NULL)
1793 *emitted_flush_bits = flush_bits;
1794
1795 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1796 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1797 }
1798
1799 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1800 uint32_t sync_op = NoWrite;
1801 struct anv_address addr = ANV_NULL_ADDRESS;
1802
1803 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1804 *
1805 * "When VF Cache Invalidate is set “Post Sync Operation” must be
1806 * enabled to “Write Immediate Data” or “Write PS Depth Count” or
1807 * “Write Timestamp”.
1808 */
1809 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1810 sync_op = WriteImmediateData;
1811 addr = device->workaround_address;
1812 }
1813
1814 /* Invalidate PC. */
1815 genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1816 sync_op, addr, 0, bits);
1817
1818 enum intel_engine_class engine_class =
1819 current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
1820 INTEL_ENGINE_CLASS_RENDER;
1821 genX(invalidate_aux_map)(batch, device, engine_class, bits);
1822
1823 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1824 }
1825
1826 #if GFX_VER >= 12
1827 bits |= defer_bits;
1828 #endif
1829
1830 return bits;
1831 }
1832
1833 ALWAYS_INLINE void
1834 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1835 {
1836 #if INTEL_NEEDS_WA_1508744258
1837 /* If we're changing the state of the RHWO optimization, we need to have
1838 * sb_stall+cs_stall.
1839 */
1840 const bool rhwo_opt_change =
1841 cmd_buffer->state.rhwo_optimization_enabled !=
1842 cmd_buffer->state.pending_rhwo_optimization_enabled;
1843 if (rhwo_opt_change) {
1844 anv_add_pending_pipe_bits(cmd_buffer,
1845 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
1846 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1847 "change RHWO optimization");
1848 }
1849 #endif
1850
1851 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1852
1853 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1854 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1855 else if (bits == 0)
1856 return;
1857
1858 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
1859 anv_cmd_buffer_is_video_queue(cmd_buffer)) {
1860 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1861 genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
1862 cmd_buffer->queue_family->engine_class, bits);
1863 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1864 }
1865 cmd_buffer->state.pending_pipe_bits = bits;
1866 return;
1867 }
1868
1869 if (GFX_VER == 9 &&
1870 (bits & ANV_PIPE_CS_STALL_BIT) &&
1871 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1872 /* If we are doing a VF cache invalidate AND a CS stall (it must be
1873 * both) then we can reset our vertex cache tracking.
1874 */
1875 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1876 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1877 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1878 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1879 }
1880
1881 enum anv_pipe_bits emitted_bits = 0;
1882 cmd_buffer->state.pending_pipe_bits =
1883 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1884 cmd_buffer->device,
1885 cmd_buffer->state.current_pipeline,
1886 bits,
1887 &emitted_bits);
1888 anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
1889
1890 #if INTEL_NEEDS_WA_1508744258
1891 if (rhwo_opt_change) {
1892 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1893 c1.RCCRHWOOptimizationDisable =
1894 !cmd_buffer->state.pending_rhwo_optimization_enabled;
1895 c1.RCCRHWOOptimizationDisableMask = true;
1896 }
1897 cmd_buffer->state.rhwo_optimization_enabled =
1898 cmd_buffer->state.pending_rhwo_optimization_enabled;
1899 }
1900 #endif
1901
1902 }
1903
1904 static inline struct anv_state
1905 emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1906 struct anv_cmd_pipeline_state *pipe_state,
1907 struct anv_pipeline_binding *binding,
1908 const struct anv_descriptor *desc)
1909 {
1910 if (!desc->buffer)
1911 return anv_null_surface_state_for_binding_table(cmd_buffer->device);
1912
1913 /* Compute the offset within the buffer */
1914 uint32_t dynamic_offset =
1915 pipe_state->dynamic_offsets[
1916 binding->set].offsets[binding->dynamic_offset_index];
1917 uint64_t offset = desc->offset + dynamic_offset;
1918 /* Clamp to the buffer size */
1919 offset = MIN2(offset, desc->buffer->vk.size);
1920 /* Clamp the range to the buffer size */
1921 uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
1922
1923 /* Align the range to the reported bounds checking alignment
1924 * VkPhysicalDeviceRobustness2PropertiesEXT::robustUniformBufferAccessSizeAlignment
1925 */
1926 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
1927 range = align(range, ANV_UBO_ALIGNMENT);
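/* Worked example with made-up numbers: for a 512-byte buffer with
 * desc->offset = 256, a dynamic offset of 64 and desc->range = 1024, we
 * get offset = MIN2(256 + 64, 512) = 320 and range = MIN2(1024, 512 - 320)
 * = 192; for dynamic UBOs the range is then aligned up to ANV_UBO_ALIGNMENT
 * so the bounds-check granularity matches the reported
 * robustUniformBufferAccessSizeAlignment.
 */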
1928
1929 struct anv_address address =
1930 anv_address_add(desc->buffer->address, offset);
1931
1932 struct anv_state surface_state =
1933 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
1934 if (surface_state.map == NULL)
1935 return ANV_STATE_NULL;
1936
1937 enum isl_format format =
1938 anv_isl_format_for_descriptor_type(cmd_buffer->device,
1939 desc->type);
1940
1941 isl_surf_usage_flags_t usage =
1942 anv_isl_usage_for_descriptor_type(desc->type);
1943
1944 anv_fill_buffer_surface_state(cmd_buffer->device,
1945 surface_state.map,
1946 format, ISL_SWIZZLE_IDENTITY,
1947 usage, address, range, 1);
1948
1949 return surface_state;
1950 }
1951
1952 static uint32_t
1953 emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1954 struct anv_cmd_pipeline_state *pipe_state,
1955 struct anv_pipeline_binding *binding,
1956 const struct anv_descriptor *desc)
1957 {
1958 struct anv_device *device = cmd_buffer->device;
1959 struct anv_state surface_state;
1960
1961 /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1962 * Depending on where the descriptor surface state is allocated, it can
1963 * either come from device->internal_surface_state_pool or
1964 * device->bindless_surface_state_pool.
1965 */
1966 switch (desc->type) {
1967 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1968 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1969 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
1970 if (desc->image_view) {
1971 const struct anv_surface_state *sstate =
1972 anv_image_view_texture_surface_state(desc->image_view,
1973 binding->plane,
1974 desc->layout);
1975 surface_state = desc->image_view->use_surface_state_stream ?
1976 sstate->state :
1977 anv_bindless_state_for_binding_table(device, sstate->state);
1978 assert(surface_state.alloc_size);
1979 } else {
1980 surface_state = anv_null_surface_state_for_binding_table(device);
1981 }
1982 break;
1983 }
1984
1985 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1986 if (desc->image_view) {
1987 const struct anv_surface_state *sstate =
1988 anv_image_view_storage_surface_state(desc->image_view);
1989 surface_state = desc->image_view->use_surface_state_stream ?
1990 sstate->state :
1991 anv_bindless_state_for_binding_table(device, sstate->state);
1992 assert(surface_state.alloc_size);
1993 } else {
1994 surface_state =
1995 anv_null_surface_state_for_binding_table(device);
1996 }
1997 break;
1998 }
1999
2000 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2001 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2002 if (desc->set_buffer_view) {
2003 surface_state = desc->set_buffer_view->general.state;
2004 assert(surface_state.alloc_size);
2005 } else {
2006 surface_state = anv_null_surface_state_for_binding_table(device);
2007 }
2008 break;
2009
2010 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2011 if (desc->buffer_view) {
2012 surface_state = anv_bindless_state_for_binding_table(
2013 device,
2014 desc->buffer_view->general.state);
2015 assert(surface_state.alloc_size);
2016 } else {
2017 surface_state = anv_null_surface_state_for_binding_table(device);
2018 }
2019 break;
2020
2021 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2022 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
2023 surface_state =
2024 emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
2025 binding, desc);
2026 break;
2027
2028 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2029 if (desc->buffer_view) {
2030 surface_state = anv_bindless_state_for_binding_table(
2031 device, desc->buffer_view->storage.state);
2032 assert(surface_state.alloc_size);
2033 } else {
2034 surface_state = anv_null_surface_state_for_binding_table(device);
2035 }
2036 break;
2037
2038 default:
2039 unreachable("Invalid descriptor type");
2040 }
2041
2042 return surface_state.offset;
2043 }
2044
2045 static uint32_t
2046 emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
2047 struct anv_cmd_pipeline_state *pipe_state,
2048 const struct anv_descriptor_set *set,
2049 struct anv_pipeline_binding *binding,
2050 const struct anv_descriptor *desc)
2051 {
2052 uint32_t desc_offset;
2053
2054 /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
2055 * Depending on where the descriptor surface state is allocated, it can
2056 * either come from device->internal_surface_state_pool or
2057 * device->bindless_surface_state_pool.
2058 */
2059 switch (desc->type) {
2060 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2061 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2062 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
2063 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
2064 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2065 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2066 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2067 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2068 desc_offset = set->desc_offset + binding->set_offset;
2069 break;
2070
2071 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2072 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2073 struct anv_state state =
2074 emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
2075 binding, desc);
2076 desc_offset = state.offset;
2077 break;
2078 }
2079
2080 default:
2081 unreachable("Invalid descriptor type");
2082 }
2083
2084 return desc_offset;
2085 }
2086
2087 static VkResult
2088 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2089 struct anv_cmd_pipeline_state *pipe_state,
2090 struct anv_shader_bin *shader,
2091 struct anv_state *bt_state)
2092 {
2093 uint32_t state_offset;
2094
2095 struct anv_pipeline_bind_map *map = &shader->bind_map;
2096 if (map->surface_count == 0) {
2097 *bt_state = (struct anv_state) { 0, };
2098 return VK_SUCCESS;
2099 }
2100
2101 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2102 map->surface_count,
2103 &state_offset);
2104 uint32_t *bt_map = bt_state->map;
2105
2106 if (bt_state->map == NULL)
2107 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2108
2109 for (uint32_t s = 0; s < map->surface_count; s++) {
2110 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2111
2112 struct anv_state surface_state;
2113
2114 switch (binding->set) {
2115 case ANV_DESCRIPTOR_SET_NULL:
2116 bt_map[s] = 0;
2117 break;
2118
2119 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2120 /* Color attachment binding */
2121 assert(shader->stage == MESA_SHADER_FRAGMENT);
2122 uint32_t index = binding->index < MAX_RTS ?
2123 cmd_buffer->state.gfx.color_output_mapping[binding->index] :
2124 binding->index;
2125 if (index < cmd_buffer->state.gfx.color_att_count) {
2126 assert(index < MAX_RTS);
2127 const struct anv_attachment *att =
2128 &cmd_buffer->state.gfx.color_att[index];
2129 surface_state = att->surface_state.state;
2130 } else {
2131 surface_state = cmd_buffer->state.gfx.null_surface_state;
2132 }
2133 assert(surface_state.map);
2134 bt_map[s] = surface_state.offset + state_offset;
2135 break;
2136
2137 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2138 struct anv_descriptor_set *set =
2139 pipe_state->descriptors[binding->index];
2140
2141 /* If the shader doesn't access the set buffer, just put the null
2142 * surface.
2143 */
2144 if (set->is_push && !shader->push_desc_info.used_set_buffer) {
2145 bt_map[s] = 0;
2146 break;
2147 }
2148
2149 /* This is a descriptor set buffer so the set index is actually
2150 * given by binding->binding. (Yes, that's confusing.)
2151 */
2152 assert(set->desc_surface_mem.alloc_size);
2153 assert(set->desc_surface_state.alloc_size);
2154 bt_map[s] = set->desc_surface_state.offset + state_offset;
2155 add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
2156 break;
2157 }
2158
2159 case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
2160 assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size);
2161 bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset +
2162 state_offset;
2163 break;
2164 }
2165
2166 default: {
2167 assert(binding->set < MAX_SETS);
2168 const struct anv_descriptor_set *set =
2169 pipe_state->descriptors[binding->set];
2170
2171 if (binding->index >= set->descriptor_count) {
2172 /* From the Vulkan spec section entitled "DescriptorSet and
2173 * Binding Assignment":
2174 *
2175 * "If the array is runtime-sized, then array elements greater
2176 * than or equal to the size of that binding in the bound
2177 * descriptor set must not be used."
2178 *
2179 * Unfortunately, the compiler isn't smart enough to figure out
2180 * when a dynamic binding isn't used so it may grab the whole
2181 * array and stick it in the binding table. In this case, it's
2182 * safe to just skip those bindings that are OOB.
2183 */
2184 assert(binding->index < set->layout->descriptor_count);
2185 continue;
2186 }
2187
2188 /* For push descriptors, if the binding is fully promoted to push
2189 * constants, just reference the null surface in the binding table.
2190 * It's unused and we didn't allocate/pack a surface state for it.
2191 */
2192 if (set->is_push) {
2193 uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
2194 assert(desc_idx < MAX_PUSH_DESCRIPTORS);
2195
2196 if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
2197 surface_state =
2198 anv_null_surface_state_for_binding_table(cmd_buffer->device);
2199 break;
2200 }
2201 }
2202
2203 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2204 if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
2205 desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
2206 /* Nothing for us to do here */
2207 continue;
2208 }
2209
2210 const struct anv_pipeline *pipeline = pipe_state->pipeline;
2211 uint32_t surface_state_offset;
2212 if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
2213 surface_state_offset =
2214 emit_indirect_descriptor_binding_table_entry(cmd_buffer,
2215 pipe_state,
2216 binding, desc);
2217 } else {
2218 assert(pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
2219 pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
2220 surface_state_offset =
2221 emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
2222 set, binding, desc);
2223 }
2224
2225 bt_map[s] = surface_state_offset + state_offset;
2226 break;
2227 }
2228 }
2229 }
2230
2231 return VK_SUCCESS;
2232 }
2233
2234 static VkResult
2235 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2236 struct anv_cmd_pipeline_state *pipe_state,
2237 struct anv_shader_bin *shader,
2238 struct anv_state *state)
2239 {
2240 struct anv_pipeline_bind_map *map = &shader->bind_map;
2241 if (map->sampler_count == 0) {
2242 *state = (struct anv_state) { 0, };
2243 return VK_SUCCESS;
2244 }
2245
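/* Each entry in the sampler state table is a 16-byte (4-dword)
 * SAMPLER_STATE structure, hence the 16-byte stride used below.
 */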
2246 uint32_t size = map->sampler_count * 16;
2247 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2248
2249 if (state->map == NULL)
2250 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2251
2252 for (uint32_t s = 0; s < map->sampler_count; s++) {
2253 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2254 const struct anv_descriptor *desc =
2255 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2256
2257 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2258 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2259 continue;
2260
2261 struct anv_sampler *sampler = desc->sampler;
2262
2263 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2264 * happens to be zero.
2265 */
2266 if (sampler == NULL)
2267 continue;
2268
2269 memcpy(state->map + (s * 16), sampler->state[binding->plane],
2270 sizeof(sampler->state[0]));
2271 }
2272
2273 return VK_SUCCESS;
2274 }
2275
2276 uint32_t
2277 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
2278 struct anv_cmd_pipeline_state *pipe_state,
2279 const VkShaderStageFlags dirty,
2280 struct anv_shader_bin **shaders,
2281 uint32_t num_shaders)
2282 {
2283 VkShaderStageFlags flushed = 0;
2284
2285 VkResult result = VK_SUCCESS;
2286 for (uint32_t i = 0; i < num_shaders; i++) {
2287 if (!shaders[i])
2288 continue;
2289
2290 gl_shader_stage stage = shaders[i]->stage;
2291 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2292 if ((vk_stage & dirty) == 0)
2293 continue;
2294
2295 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2296 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2297 &cmd_buffer->state.samplers[stage]);
2298 if (result != VK_SUCCESS)
2299 break;
2300
2301 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2302 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2303 &cmd_buffer->state.binding_tables[stage]);
2304 if (result != VK_SUCCESS)
2305 break;
2306
2307 flushed |= vk_stage;
2308 }
2309
2310 if (result != VK_SUCCESS) {
2311 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2312
2313 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2314 if (result != VK_SUCCESS)
2315 return 0;
2316
2317 /* Re-emit the BT base address so we get the new surface state base
2318 * address before we start emitting binding tables etc.
2319 */
2320 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
2321
2322 /* Re-emit all active binding tables */
2323 flushed = 0;
2324
2325 for (uint32_t i = 0; i < num_shaders; i++) {
2326 if (!shaders[i])
2327 continue;
2328
2329 gl_shader_stage stage = shaders[i]->stage;
2330
2331 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2332 &cmd_buffer->state.samplers[stage]);
2333 if (result != VK_SUCCESS) {
2334 anv_batch_set_error(&cmd_buffer->batch, result);
2335 return 0;
2336 }
2337 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2338 &cmd_buffer->state.binding_tables[stage]);
2339 if (result != VK_SUCCESS) {
2340 anv_batch_set_error(&cmd_buffer->batch, result);
2341 return 0;
2342 }
2343
2344 flushed |= mesa_to_vk_shader_stage(stage);
2345 }
2346 }
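/* Note on the recovery path above: once a fresh binding table block is
 * bound and the binding table pool base address re-emitted, every table is
 * rebuilt from scratch, so the returned stage mask only ever describes
 * tables that live in the currently bound block.
 */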
2347
2348 return flushed;
2349 }
2350
2351 /* This function generates the surface state used to read the content of the
2352 * descriptor buffer.
2353 */
2354 void
2355 genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
2356 struct anv_descriptor_set *set)
2357 {
2358 assert(set->desc_surface_state.map == NULL);
2359
2360 struct anv_descriptor_set_layout *layout = set->layout;
2361 enum isl_format format =
2362 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2363 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2364
2365 set->desc_surface_state =
2366 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2367 if (set->desc_surface_state.map == NULL)
2368 return;
2369 anv_fill_buffer_surface_state(cmd_buffer->device,
2370 set->desc_surface_state.map,
2371 format, ISL_SWIZZLE_IDENTITY,
2372 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2373 set->desc_surface_addr,
2374 layout->descriptor_buffer_surface_size, 1);
2375 }
2376
2377 /* This function generates surface states used by a pipeline for push
2378 * descriptors. This is delayed to the draw/dispatch time to avoid allocation
2379 * and surface state generation when a pipeline is not going to use the
2380 * binding table to access any push descriptor data.
2381 */
2382 void
2383 genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
2384 struct anv_descriptor_set *set)
2385 {
2386 while (set->generate_surface_states) {
2387 int desc_idx = u_bit_scan(&set->generate_surface_states);
2388 struct anv_descriptor *desc = &set->descriptors[desc_idx];
2389 struct anv_buffer_view *bview = desc->set_buffer_view;
2390
2391 if (bview != NULL && bview->general.state.map == NULL) {
2392 bview->general.state =
2393 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2394 if (bview->general.state.map == NULL)
2395 return;
2396 anv_descriptor_write_surface_state(cmd_buffer->device, desc,
2397 bview->general.state);
2398 }
2399 }
2400 }
2401
2402 ALWAYS_INLINE void
2403 genX(batch_emit_pipe_control)(struct anv_batch *batch,
2404 const struct intel_device_info *devinfo,
2405 uint32_t current_pipeline,
2406 enum anv_pipe_bits bits,
2407 const char *reason)
2408 {
2409 genX(batch_emit_pipe_control_write)(batch,
2410 devinfo,
2411 current_pipeline,
2412 NoWrite,
2413 ANV_NULL_ADDRESS,
2414 0,
2415 bits,
2416 reason);
2417 }
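/* Minimal usage sketch (the reason string is free-form and only consumed by
 * debug output and tracing):
 *
 *    genX(batch_emit_pipe_control)(batch, devinfo, current_pipeline,
 *                                  ANV_PIPE_CS_STALL_BIT,
 *                                  "example: stall before reprogramming");
 */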
2418
2419 ALWAYS_INLINE void
2420 genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
2421 const struct intel_device_info *devinfo,
2422 uint32_t current_pipeline,
2423 uint32_t post_sync_op,
2424 struct anv_address address,
2425 uint32_t imm_data,
2426 enum anv_pipe_bits bits,
2427 const char *reason)
2428 {
2429 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
2430 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
2431 unreachable("Trying to emit unsupported PIPE_CONTROL command.");
2432
2433 const bool trace_flush =
2434 (bits & (ANV_PIPE_FLUSH_BITS |
2435 ANV_PIPE_STALL_BITS |
2436 ANV_PIPE_INVALIDATE_BITS |
2437 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
2438 if (trace_flush && batch->trace != NULL) {
2439 // Store pipe control reasons if there is enough space
2440 if (batch->pc_reasons_count < ARRAY_SIZE(batch->pc_reasons)) {
2441 batch->pc_reasons[batch->pc_reasons_count++] = reason;
2442 }
2443 trace_intel_begin_stall(batch->trace);
2444 }
2445
2446
2447 /* XXX - insert all workarounds and GFX specific things below. */
2448
2449 /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
2450 * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
2451 * with CS_STALL Bit set (with No POST_SYNC ENABLED)
2452 */
2453 if (intel_device_info_is_adln(devinfo) &&
2454 current_pipeline == GPGPU &&
2455 post_sync_op != NoWrite) {
2456 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2457 pipe.CommandStreamerStallEnable = true;
2458 anv_debug_dump_pc(pipe, "Wa_14014966230");
2459 };
2460 }
2461
2462 /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
2463 * PIPE_CONTROL, Flush Types:
2464 * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
2465 * For newer platforms this is documented in the PIPE_CONTROL instruction
2466 * page.
2467 */
2468 if (current_pipeline == GPGPU &&
2469 (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
2470 bits |= ANV_PIPE_CS_STALL_BIT;
2471
2472 #if INTEL_NEEDS_WA_1409600907
2473 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2474 * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2475 */
2476 if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
2477 bits |= ANV_PIPE_DEPTH_STALL_BIT;
2478 #endif
2479
2480 #if GFX_VERx10 >= 125
2481 if (current_pipeline != GPGPU) {
2482 if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
2483 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2484 } else {
2485 if (bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
2486 ANV_PIPE_DATA_CACHE_FLUSH_BIT))
2487 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2488 }
2489
2490 /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
2491 *
2492 * "'HDC Pipeline Flush' bit must be set for this bit to take
2493 * effect."
2494 */
2495 if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
2496 bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2497 #endif
2498
2499 #if GFX_VER < 12
2500 if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
2501 bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2502 #endif
2503
2504 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
2505 *
2506 * "If the VF Cache Invalidation Enable is set to a 1 in a
2507 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
2508 * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent
2509 * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
2510 * a 1."
2511 *
2512 * This appears to hang Broadwell, so we restrict it to just gfx9.
2513 */
2514 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
2515 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
2516
2517 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2518 #if GFX_VERx10 >= 125
2519 pipe.UntypedDataPortCacheFlushEnable =
2520 bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2521 pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
2522 #endif
2523 #if GFX_VER == 12
2524 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2525 pipe.L3FabricFlush = bits & ANV_PIPE_L3_FABRIC_FLUSH_BIT;
2526 #endif
2527 #if GFX_VER > 11
2528 pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2529 #endif
2530 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2531 pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2532 pipe.RenderTargetCacheFlushEnable =
2533 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2534
2535 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2536
2537 pipe.TLBInvalidate = bits & ANV_PIPE_TLB_INVALIDATE_BIT;
2538
2539 #if GFX_VERx10 >= 125
2540 pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2541 #endif
2542 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2543 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2544
2545 pipe.StateCacheInvalidationEnable =
2546 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2547 pipe.ConstantCacheInvalidationEnable =
2548 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2549 #if GFX_VER >= 12
2550 /* Invalidates the L3 cache part in which index & vertex data is loaded
2551 * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2552 */
2553 pipe.L3ReadOnlyCacheInvalidationEnable =
2554 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2555 #endif
2556 pipe.VFCacheInvalidationEnable =
2557 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2558 pipe.TextureCacheInvalidationEnable =
2559 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2560 pipe.InstructionCacheInvalidateEnable =
2561 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2562
2563 pipe.PostSyncOperation = post_sync_op;
2564 pipe.Address = address;
2565 pipe.DestinationAddressType = DAT_PPGTT;
2566 pipe.ImmediateData = imm_data;
2567
2568 anv_debug_dump_pc(pipe, reason);
2569 }
2570
2571 if (trace_flush && batch->trace != NULL) {
2572 trace_intel_end_stall(batch->trace, bits,
2573 anv_pipe_flush_bit_to_ds_stall_flag,
2574 batch->pc_reasons[0],
2575 batch->pc_reasons[1],
2576 batch->pc_reasons[2],
2577 batch->pc_reasons[3]);
2578 batch->pc_reasons[0] = NULL;
2579 batch->pc_reasons[1] = NULL;
2580 batch->pc_reasons[2] = NULL;
2581 batch->pc_reasons[3] = NULL;
2582 batch->pc_reasons_count = 0;
2583 }
2584 }
2585
2586 /* Set preemption on/off. */
2587 void
2588 genX(batch_set_preemption)(struct anv_batch *batch,
2589 struct anv_device *device,
2590 uint32_t current_pipeline,
2591 bool value)
2592 {
2593 #if INTEL_WA_16013994831_GFX_VER
2594 if (!intel_needs_workaround(device->info, 16013994831))
2595 return;
2596
2597 anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
2598 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
2599 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
2600 }
2601
2602 /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
2603 genx_batch_emit_pipe_control(batch, device->info, current_pipeline,
2604 ANV_PIPE_CS_STALL_BIT);
2605
2606 for (unsigned i = 0; i < 250; i++)
2607 anv_batch_emit(batch, GENX(MI_NOOP), noop);
2608 #endif
2609 }
2610
2611 void
2612 genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
2613 {
2614 #if GFX_VERx10 >= 120
2615 if (cmd_buffer->state.gfx.object_preemption == value)
2616 return;
2617
2618 genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device,
2619 cmd_buffer->state.current_pipeline,
2620 value);
2621 cmd_buffer->state.gfx.object_preemption = value;
2622 #endif
2623 }
2624
2625 ALWAYS_INLINE static void
2626 update_descriptor_set_surface_state(struct anv_cmd_buffer *cmd_buffer,
2627 struct anv_cmd_pipeline_state *pipe_state,
2628 uint32_t set_idx)
2629 {
2630 if (!pipe_state->descriptor_buffers[set_idx].bound)
2631 return;
2632
2633 const struct anv_physical_device *device = cmd_buffer->device->physical;
2634 const int32_t buffer_index =
2635 pipe_state->descriptor_buffers[set_idx].buffer_index;
2636 const struct anv_va_range *push_va_range =
2637 GFX_VERx10 >= 125 ?
2638 &device->va.push_descriptor_buffer_pool :
2639 &device->va.internal_surface_state_pool;
2640 const struct anv_va_range *va_range =
2641 buffer_index == -1 ? push_va_range : &device->va.dynamic_visible_pool;
2642 const uint64_t descriptor_set_addr =
2643 (buffer_index == -1 ? va_range->addr :
2644 cmd_buffer->state.descriptor_buffers.address[buffer_index]) +
2645 pipe_state->descriptor_buffers[set_idx].buffer_offset;
2646 const uint64_t set_size =
2647 MIN2(va_range->size - (descriptor_set_addr - va_range->addr),
2648 anv_physical_device_bindless_heap_size(device, true));
2649
2650 if (descriptor_set_addr != pipe_state->descriptor_buffers[set_idx].address) {
2651 pipe_state->descriptor_buffers[set_idx].address = descriptor_set_addr;
2652
2653 struct anv_state surface_state =
2654 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2655 const enum isl_format format =
2656 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2657 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2658 anv_fill_buffer_surface_state(
2659 cmd_buffer->device, surface_state.map,
2660 format, ISL_SWIZZLE_IDENTITY,
2661 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2662 anv_address_from_u64(pipe_state->descriptor_buffers[set_idx].address),
2663 set_size, 1);
2664
2665 pipe_state->descriptor_buffers[set_idx].state = surface_state;
2666 }
2667 }
2668
2669 ALWAYS_INLINE static uint32_t
2670 compute_descriptor_set_surface_offset(const struct anv_cmd_buffer *cmd_buffer,
2671 const struct anv_cmd_pipeline_state *pipe_state,
2672 const uint32_t set_idx)
2673 {
2674 const struct anv_physical_device *device = cmd_buffer->device->physical;
2675
2676 if (device->uses_ex_bso) {
2677 int32_t buffer_index =
2678 pipe_state->descriptor_buffers[set_idx].buffer_index;
2679 uint64_t buffer_address =
2680 buffer_index == -1 ?
2681 device->va.push_descriptor_buffer_pool.addr :
2682 cmd_buffer->state.descriptor_buffers.address[buffer_index];
2683
2684 return (buffer_address - device->va.dynamic_visible_pool.addr) +
2685 pipe_state->descriptor_buffers[set_idx].buffer_offset;
2686 }
2687
2688 return pipe_state->descriptor_buffers[set_idx].buffer_offset << 6;
2689 }
2690
2691 ALWAYS_INLINE static uint32_t
2692 compute_descriptor_set_sampler_offset(const struct anv_cmd_buffer *cmd_buffer,
2693 const struct anv_cmd_pipeline_state *pipe_state,
2694 const uint32_t set_idx)
2695 {
2696 const struct anv_physical_device *device = cmd_buffer->device->physical;
2697 int32_t buffer_index =
2698 pipe_state->descriptor_buffers[set_idx].buffer_index;
2699 uint64_t buffer_address =
2700 buffer_index == -1 ?
2701 device->va.push_descriptor_buffer_pool.addr :
2702 cmd_buffer->state.descriptor_buffers.address[buffer_index];
2703
2704 return (buffer_address - device->va.dynamic_state_pool.addr) +
2705 pipe_state->descriptor_buffers[set_idx].buffer_offset;
2706 }
2707
2708 void
2709 genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
2710 struct anv_cmd_pipeline_state *pipe_state)
2711 {
2712 /* On Gfx12.5+ the STATE_BASE_ADDRESS BindlessSurfaceStateBaseAddress &
2713 * DynamicStateBaseAddress are fixed. So as long as we stay in one
2714 * descriptor buffer mode, there is no need to switch.
2715 */
2716 #if GFX_VERx10 >= 125
2717 if (cmd_buffer->state.current_db_mode !=
2718 cmd_buffer->state.pending_db_mode)
2719 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2720 #else
2721 if (cmd_buffer->state.descriptor_buffers.dirty)
2722 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2723 #endif
2724
2725 assert(cmd_buffer->state.current_db_mode !=
2726 ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
2727 if (cmd_buffer->state.current_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER &&
2728 (cmd_buffer->state.descriptor_buffers.dirty ||
2729 (pipe_state->pipeline->active_stages &
2730 cmd_buffer->state.descriptor_buffers.offsets_dirty) != 0)) {
2731 struct anv_push_constants *push_constants =
2732 &pipe_state->push_constants;
2733 for (uint32_t i = 0; i < ARRAY_SIZE(push_constants->desc_surface_offsets); i++) {
2734 update_descriptor_set_surface_state(cmd_buffer, pipe_state, i);
2735
2736 push_constants->desc_surface_offsets[i] =
2737 compute_descriptor_set_surface_offset(cmd_buffer, pipe_state, i);
2738 push_constants->desc_sampler_offsets[i] =
2739 compute_descriptor_set_sampler_offset(cmd_buffer, pipe_state, i);
2740 }
2741
2742 #if GFX_VERx10 < 125
2743 struct anv_device *device = cmd_buffer->device;
2744 push_constants->surfaces_base_offset =
2745 (cmd_buffer->state.descriptor_buffers.surfaces_address -
2746 device->physical->va.dynamic_visible_pool.addr);
2747 #endif
2748
2749 cmd_buffer->state.push_constants_dirty |=
2750 (cmd_buffer->state.descriptor_buffers.offsets_dirty &
2751 pipe_state->pipeline->active_stages);
2752 pipe_state->push_constants_data_dirty = true;
2753 cmd_buffer->state.descriptor_buffers.offsets_dirty &=
2754 ~pipe_state->pipeline->active_stages;
2755 }
2756
2757 cmd_buffer->state.descriptor_buffers.dirty = false;
2758 }
2759
2760 void
2761 genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer,
2762 VkCommandBufferLevel level)
2763 {
2764 cmd_buffer->vk.level = level;
2765 cmd_buffer->is_companion_rcs_cmd_buffer = true;
2766
2767 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
2768
2769 #if GFX_VER >= 12
2770 /* Reenable prefetching at the beginning of secondary command buffers. We
2771 * do this so that the return instruction is not prefetched before its
2772 * edit has completed.
2773 */
2774 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2775 anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
2776 arb.PreParserDisableMask = true;
2777 arb.PreParserDisable = false;
2778 }
2779 }
2780 #endif
2781
2782 /* A companion command buffer is only used for blorp commands atm, so
2783 * default to the legacy mode.
2784 */
2785 cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
2786 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
2787
2788 /* Invalidate the aux table in every primary command buffer. This ensures
2789 * the command buffer sees the latest updates made by the host.
2790 */
2791 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2792 cmd_buffer->device->info->has_aux_map) {
2793 anv_add_pending_pipe_bits(cmd_buffer,
2794 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2795 "new cmd buffer with aux-tt");
2796 }
2797 }
2798
2799 static bool
2800 aux_op_resolves(enum isl_aux_op aux_op)
2801 {
2802 return aux_op == ISL_AUX_OP_FULL_RESOLVE ||
2803 aux_op == ISL_AUX_OP_PARTIAL_RESOLVE;
2804 }
2805
2806 static bool
2807 aux_op_clears(enum isl_aux_op aux_op)
2808 {
2809 return aux_op == ISL_AUX_OP_FAST_CLEAR ||
2810 aux_op == ISL_AUX_OP_AMBIGUATE;
2811 }
2812
2813 static bool
2814 aux_op_renders(enum isl_aux_op aux_op)
2815 {
2816 return aux_op == ISL_AUX_OP_NONE;
2817 }
2818
2819 static void
2820 add_pending_pipe_bits_for_color_aux_op(struct anv_cmd_buffer *cmd_buffer,
2821 enum isl_aux_op next_aux_op,
2822 enum anv_pipe_bits pipe_bits)
2823 {
2824 const enum isl_aux_op last_aux_op = cmd_buffer->state.color_aux_op;
2825 assert(next_aux_op != last_aux_op);
2826
2827 char flush_reason[64] = {};
2828 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) ||
2829 u_trace_enabled(&cmd_buffer->device->ds.trace_context)) {
2830 int ret = snprintf(flush_reason, sizeof(flush_reason),
2831 "color aux-op: %s -> %s",
2832 isl_aux_op_to_name(last_aux_op),
2833 isl_aux_op_to_name(next_aux_op));
2834 assert(ret < sizeof(flush_reason));
2835 }
2836
2837 anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, flush_reason);
2838 }
2839
2840 void
2841 genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
2842 enum isl_aux_op next_aux_op)
2843 {
2844 const enum isl_aux_op last_aux_op = cmd_buffer->state.color_aux_op;
2845
2846 if (!aux_op_clears(last_aux_op) && aux_op_clears(next_aux_op)) {
2847 #if GFX_VER >= 20
2848 /* From the Xe2 Bspec 57340 (r59562),
2849 * "MCS/CCS Buffers, Fast Clear for Render Target(s)":
2850 *
2851 * Synchronization:
2852 * Due to interaction of scaled clearing rectangle with pixel
2853 * scoreboard, we require one of the following commands to be
2854 * issued. [...]
2855 *
2856 * PIPE_CONTROL
2857 * PSS Stall Sync Enable [...] 1b (Enable)
2858 * Machine-wide Stall at Pixel Stage, wait for all Prior Pixel
2859 * Work to Reach End of Pipe
2860 * Render Target Cache Flush Enable [...] 1b (Enable)
2861 * Post-Sync Op Flushes Render Cache before Unblocking Stall
2862 *
2863 * This synchronization step is required before and after the fast
2864 * clear pass, to ensure correct ordering between pixels.
2865 */
2866 add_pending_pipe_bits_for_color_aux_op(
2867 cmd_buffer, next_aux_op,
2868 ANV_PIPE_PSS_STALL_SYNC_BIT |
2869 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2870
2871 #elif GFX_VERx10 == 125
2872 /* From the ACM Bspec 47704 (r52663), "Render Target Fast Clear":
2873 *
2874 * Preamble pre fast clear synchronization
2875 *
2876 * PIPE_CONTROL:
2877 * PS sync stall = 1
2878 * Tile Cache Flush = 1
2879 * RT Write Flush = 1
2880 * HDC Flush = 1
2881 * DC Flush = 1
2882 * Texture Invalidate = 1
2883 *
2884 * [...]
2885 *
2886 * Objective of the preamble flushes is to ensure all data is
2887 * evicted from L1 caches prior to fast clear.
2888 */
2889 add_pending_pipe_bits_for_color_aux_op(
2890 cmd_buffer, next_aux_op,
2891 ANV_PIPE_PSS_STALL_SYNC_BIT |
2892 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2893 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2894 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
2895 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
2896 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT);
2897
2898 #elif GFX_VERx10 == 120
2899 /* From the TGL Bspec 47704 (r52663), "Render Target Fast Clear":
2900 *
2901 * Preamble pre fast clear synchronization
2902 *
2903 * PIPE_CONTROL:
2904 * Depth Stall = 1
2905 * Tile Cache Flush = 1
2906 * RT Write Flush = 1
2907 * Texture Invalidate = 1
2908 *
2909 * [...]
2910 *
2911 * Objective of the preamble flushes is to ensure all data is
2912 * evicted from L1 caches prior to fast clear.
2913 */
2914 add_pending_pipe_bits_for_color_aux_op(
2915 cmd_buffer, next_aux_op,
2916 ANV_PIPE_DEPTH_STALL_BIT |
2917 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
2918 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2919 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT);
2920
2921 #else
2922 /* From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
2923 *
2924 * Any transition from any value in {Clear, Render, Resolve} to a
2925 * different value in {Clear, Render, Resolve} requires end of pipe
2926 * synchronization.
2927 *
2928 * From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
2929 *
2930 * After Render target fast clear, pipe-control with color cache
2931 * write-flush must be issued before sending any DRAW commands on
2932 * that render target.
2933 *
2934 * The last comment is a bit cryptic and doesn't really tell you what's
2935 * going on or what's really needed. It appears that fast clear ops are
2936 * not properly synchronized with other drawing. This means that we
2937 * cannot have a fast clear operation in the pipe at the same time as
2938 * other regular drawing operations. We need to use a PIPE_CONTROL
2939 * to ensure that the contents of the previous draw hit the render
2940 * target before we resolve and then use a second PIPE_CONTROL after
2941 * the resolve to ensure that it is completed before any additional
2942 * drawing occurs.
2943 */
2944 add_pending_pipe_bits_for_color_aux_op(
2945 cmd_buffer, next_aux_op,
2946 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2947 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
2948 #endif
2949
2950 } else if (aux_op_clears(last_aux_op) && !aux_op_clears(next_aux_op)) {
2951 #if GFX_VERx10 >= 125
2952 /* From the ACM PRM Vol. 9, "Color Fast Clear Synchronization":
2953 *
2954 * Postamble post fast clear synchronization
2955 *
2956 * PIPE_CONTROL:
2957 * PS sync stall = 1
2958 * RT flush = 1
2959 */
2960 add_pending_pipe_bits_for_color_aux_op(
2961 cmd_buffer, next_aux_op,
2962 ANV_PIPE_PSS_STALL_SYNC_BIT |
2963 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
2964
2965 #elif GFX_VERx10 == 120
2966 /* From the TGL PRM Vol. 9, "Color Fast Clear Synchronization":
2967 *
2968 * Postamble post fast clear synchronization
2969 *
2970 * PIPE_CONTROL:
2971 * Depth Stall = 1
2972 * Tile Cache Flush = 1
2973 * RT Write Flush = 1
2974 *
2975 * From the TGL PRM Vol. 2a, "PIPE_CONTROL::L3 Fabric Flush":
2976 *
2977 * For a sequence of color fast clears. A single PIPE_CONTROL
2978 * command with Render Target Cache Flush, L3 Fabric Flush and Depth
2979 * Stall set at the end of the sequence suffices.
2980 *
2981 * Replace the Tile Cache flush with an L3 fabric flush.
2982 */
2983 add_pending_pipe_bits_for_color_aux_op(
2984 cmd_buffer, next_aux_op,
2985 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
2986 ANV_PIPE_L3_FABRIC_FLUSH_BIT |
2987 ANV_PIPE_DEPTH_STALL_BIT);
2988
2989 #else
2990 /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
2991 *
2992 * After Render target fast clear, pipe-control with color cache
2993 * write-flush must be issued before sending any DRAW commands on
2994 * that render target.
2995 *
2996 * From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
2997 *
2998 * Any transition from any value in {Clear, Render, Resolve} to a
2999 * different value in {Clear, Render, Resolve} requires end of pipe
3000 * synchronization.
3001 */
3002 add_pending_pipe_bits_for_color_aux_op(
3003 cmd_buffer, next_aux_op,
3004 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3005 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3006 #endif
3007
3008 } else if (aux_op_renders(last_aux_op) != aux_op_renders(next_aux_op)) {
3009 assert(aux_op_resolves(last_aux_op) != aux_op_resolves(next_aux_op));
3010 /* From the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
3011 *
3012 * Any transition from any value in {Clear, Render, Resolve} to a
3013 * different value in {Clear, Render, Resolve} requires end of pipe
3014 * synchronization.
3015 *
3016 * We perform a flush of the write cache before and after the clear and
3017 * resolve operations to meet this requirement.
3018 *
3019 * Unlike other drawing, fast clear operations are not properly
3020 * synchronized. The first PIPE_CONTROL here likely ensures that the
3021 * contents of the previous render or clear hit the render target before
3022 * we resolve and the second likely ensures that the resolve is complete
3023 * before we do any more rendering or clearing.
3024 */
3025 add_pending_pipe_bits_for_color_aux_op(
3026 cmd_buffer, next_aux_op,
3027 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3028 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
3029 }
3030
3031 if (last_aux_op != ISL_AUX_OP_FAST_CLEAR &&
3032 next_aux_op == ISL_AUX_OP_FAST_CLEAR &&
3033 cmd_buffer->device->isl_dev.ss.clear_color_state_size > 0) {
3034 /* From the ICL PRM Vol. 9, "State Caching":
3035 *
3036 * Any values referenced by pointers within the RENDER_SURFACE_STATE
3037 * [...] (e.g. Clear Color Pointer, [...]) are considered to be part
3038 * of that state and any changes to these referenced values requires
3039 * an invalidation of the L1 state cache to ensure the new values are
3040 * being used as part of the state. [...]
3041 *
3042 * We could alternatively perform this invalidation when we stop
3043 * fast-clearing. A benefit to doing it now, when transitioning to a
3044 * fast clear, is that we save a pipe control by combining the state
3045 * cache invalidation with the texture cache invalidation done on gfx12.
3046 */
3047 anv_add_pending_pipe_bits(cmd_buffer,
3048 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
3049 "Invalidate for new clear color");
3050 }
3051
3052 /* Update the auxiliary surface operation, but with one exception. */
3053 if (last_aux_op == ISL_AUX_OP_FAST_CLEAR &&
3054 next_aux_op == ISL_AUX_OP_AMBIGUATE) {
3055 assert(aux_op_clears(last_aux_op) && aux_op_clears(next_aux_op));
3056 /* Fast clears and ambiguates are in the same class of operation, but
3057 * fast clears have more stringent synchronization requirements. For
3058 * better performance, don't replace the current fast clear operation
3059 * state with ambiguate. This allows us to perform one state cache
3060 * invalidation when leaving a sequence which alternates between
3061 * ambiguates and clears, instead of multiple such invalidations.
3062 */
3063 } else {
3064 cmd_buffer->state.color_aux_op = next_aux_op;
3065 }
3066
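/* Track fast clears emitted through this command buffer, distinguishing
 * clears whose previous aux operation was already a clear-class operation
 * from independent ones.
 */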
3067 if (next_aux_op == ISL_AUX_OP_FAST_CLEAR) {
3068 if (aux_op_clears(last_aux_op)) {
3069 cmd_buffer->num_dependent_clears++;
3070 } else {
3071 cmd_buffer->num_independent_clears++;
3072 }
3073 }
3074 }
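
/*
 * Illustrative sketch (not driver code): the pending bits added above only
 * land in the batch once they are applied, so a caller emitting a color fast
 * clear is expected to follow roughly the pattern used elsewhere in this
 * file (e.g. end_command_buffer() below), with the clear emission itself
 * left as a placeholder:
 *
 *    genX(cmd_buffer_update_color_aux_op)(cmd_buffer, ISL_AUX_OP_FAST_CLEAR);
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 *    ... emit the fast clear (e.g. through blorp) ...
 *    genX(cmd_buffer_update_color_aux_op)(cmd_buffer, ISL_AUX_OP_NONE);
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 */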
3075
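/* Toggle protected-memory mode on the command streamer. Used below to enable
 * protected mode at the start of protected primary command buffers, to
 * disable it again at EndCommandBuffer, and to temporarily drop out of it
 * around work that must not run protected (e.g. the surface state copies in
 * CmdExecuteCommands).
 */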
3076 static void
3077 genX(cmd_buffer_set_protected_memory)(struct anv_cmd_buffer *cmd_buffer,
3078 bool enabled)
3079 {
3080 #if GFX_VER >= 12
3081 if (enabled) {
3082 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
3083 /* Default value for single session. */
3084 appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
3085 appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
3086 }
3087 }
3088 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3089 pc.PipeControlFlushEnable = true;
3090 pc.DCFlushEnable = true;
3091 pc.RenderTargetCacheFlushEnable = true;
3092 pc.CommandStreamerStallEnable = true;
3093 if (enabled)
3094 pc.ProtectedMemoryEnable = true;
3095 else
3096 pc.ProtectedMemoryDisable = true;
3097 }
3098 #else
3099 unreachable("Protected content not supported");
3100 #endif
3101 }
3102
3103 VkResult
3104 genX(BeginCommandBuffer)(
3105 VkCommandBuffer commandBuffer,
3106 const VkCommandBufferBeginInfo* pBeginInfo)
3107 {
3108 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3109 VkResult result;
3110
3111 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
3112 * command buffer's state. Otherwise, we must *reset* its state. In both
3113 * cases we reset it.
3114 *
3115 * From the Vulkan 1.0 spec:
3116 *
3117 * If a command buffer is in the executable state and the command buffer
3118 * was allocated from a command pool with the
3119 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
3120 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
3121 * as if vkResetCommandBuffer had been called with
3122 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
3123 * the command buffer in the recording state.
3124 */
3125 anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
3126 anv_cmd_buffer_reset_rendering(cmd_buffer);
3127
3128 cmd_buffer->usage_flags = pBeginInfo->flags;
3129
3130 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3131 * primary level command buffers.
3132 *
3133 * From the Vulkan 1.0 spec:
3134 *
3135 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3136 * secondary command buffer is considered to be entirely inside a render
3137 * pass. If this is a primary command buffer, then this bit is ignored.
3138 */
3139 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
3140 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3141
3142 #if GFX_VER >= 12
3143 /* Re-enable prefetching at the beginning of secondary command buffers. We
3144 * do this so that the edited return instruction is not prefetched before
3145 * the edit completes.
3146 */
3147 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3148 anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
3149 arb.PreParserDisableMask = true;
3150 arb.PreParserDisable = false;
3151 }
3152 }
3153 #endif
3154
3155 /* Assume the viewport has already been set in primary command buffers. */
3156 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
3157 cmd_buffer->state.gfx.viewport_set = true;
3158
3159 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
3160
3161 if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
3162 anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3163 /* Invalidate the aux table in every primary command buffer. This
3164 * ensures the command buffer sees the last updates made by the host.
3165 */
3166 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3167 cmd_buffer->device->info->has_aux_map) {
3168 anv_add_pending_pipe_bits(cmd_buffer,
3169 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
3170 "new cmd buffer with aux-tt");
3171 }
3172 return VK_SUCCESS;
3173 }
3174
3175 #if GFX_VER >= 12
3176 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3177 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3178 genX(cmd_buffer_set_protected_memory)(cmd_buffer, true);
3179 #endif
3180
3181 if (cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer) {
3182 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
3183 } else {
3184 cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
3185 genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
3186 }
3187
3188 /* We sometimes store vertex data in the dynamic state buffer for blorp
3189 * operations and our dynamic state stream may re-use data from previous
3190 * command buffers. In order to prevent stale cache data, we flush the VF
3191 * cache. We could do this on every blorp call but that's not really
3192 * needed as all of the data will get written by the CPU prior to the GPU
3193 * executing anything. The chances are fairly high that they will use
3194 * blorp at least once per primary command buffer so it shouldn't be
3195 * wasted.
3196 *
3197 * There is also a workaround on gfx8 which requires us to invalidate the
3198 * VF cache occasionally. It's easier if we can assume we start with a
3199 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
3200 */
3201 anv_add_pending_pipe_bits(cmd_buffer,
3202 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
3203 "new cmd buffer");
3204
3205 /* Invalidate the aux table in every primary command buffer. This ensures
3206 * the command buffer sees the last updates made by the host.
3207 */
3208 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3209 cmd_buffer->device->info->has_aux_map) {
3210 anv_add_pending_pipe_bits(cmd_buffer,
3211 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
3212 "new cmd buffer with aux-tt");
3213 }
3214
3215 /* We send an "Indirect State Pointers Disable" packet at
3216 * EndCommandBuffer, so all push constant packets are ignored during a
3217 * context restore. Documentation says after that command, we need to
3218 * emit push constants again before any rendering operation. So we
3219 * flag them dirty here to make sure they get emitted.
3220 */
3221 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
3222 cmd_buffer->state.gfx.base.push_constants_data_dirty = true;
3223
3224 if (cmd_buffer->usage_flags &
3225 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3226 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3227
3228 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
3229 const VkRenderingInfo *resume_info =
3230 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
3231 pBeginInfo,
3232 gcbiar_data);
3233 if (resume_info != NULL) {
3234 genX(CmdBeginRendering)(commandBuffer, resume_info);
3235 } else {
3236 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
3237 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
3238 pBeginInfo);
3239 assert(inheritance_info);
3240
3241 gfx->rendering_flags = inheritance_info->flags;
3242 gfx->render_area = (VkRect2D) { };
3243 gfx->layer_count = 0;
3244 gfx->samples = inheritance_info->rasterizationSamples;
3245 gfx->view_mask = inheritance_info->viewMask;
3246
3247 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
3248 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
3249 if (result != VK_SUCCESS)
3250 return result;
3251
3252 for (uint32_t i = 0; i < color_att_count; i++) {
3253 gfx->color_att[i].vk_format =
3254 inheritance_info->pColorAttachmentFormats[i];
3255 }
3256 gfx->depth_att.vk_format =
3257 inheritance_info->depthAttachmentFormat;
3258 gfx->stencil_att.vk_format =
3259 inheritance_info->stencilAttachmentFormat;
3260
3261 anv_cmd_graphic_state_update_has_uint_rt(gfx);
3262
3263 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
3264 ANV_CMD_DIRTY_RENDER_TARGETS;
3265 }
3266 }
3267
3268 /* Emit the sample pattern at the beginning of the batch because the
3269 * default locations emitted at the device initialization might have been
3270 * changed by a previous command buffer.
3271 *
3272 * Do not change that when we're continuing a previous renderpass.
3273 */
3274 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
3275 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
3276 genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
3277
3278 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3279 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
3280 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
3281
3282 /* If the secondary buffer supports conditional rendering,
3283 * we should emit commands as if conditional rendering is enabled.
3284 */
3285 cmd_buffer->state.conditional_render_enabled =
3286 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
3287
3288 if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
3289 cmd_buffer->state.gfx.n_occlusion_queries = 1;
3290 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
3291 }
3292 }
3293
3294 return VK_SUCCESS;
3295 }
3296
3297 /* From the PRM, Volume 2a:
3298 *
3299 * "Indirect State Pointers Disable
3300 *
3301 * At the completion of the post-sync operation associated with this pipe
3302 * control packet, the indirect state pointers in the hardware are
3303 * considered invalid; the indirect pointers are not saved in the context.
3304 * If any new indirect state commands are executed in the command stream
3305 * while the pipe control is pending, the new indirect state commands are
3306 * preserved.
3307 *
3308 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
3309 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
3310 * commands are only considered as Indirect State Pointers. Once ISP is
3311 * issued in a context, SW must initialize by programming push constant
3312 * commands for all the shaders (at least to zero length) before attempting
3313 * any rendering operation for the same context."
3314 *
3315 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
3316 * even though they point to a BO that has been already unreferenced at
3317 * the end of the previous batch buffer. This has been fine so far since
3318 * we are protected by the scratch page (every address not covered by
3319 * a BO should be pointing to the scratch page). But on CNL, it is
3320 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
3321 * instruction.
3322 *
3323 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
3324 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
3325 * context restore, so the mentioned hang doesn't happen. However,
3326 * software must program push constant commands for all stages prior to
3327 * rendering anything. So we flag them dirty in BeginCommandBuffer.
3328 *
3329 * Finally, we also make sure to stall at pixel scoreboard to make sure the
3330 * constants have been loaded into the EUs prior to disabling the push
3331 * constants, so that a previous 3DPRIMITIVE doesn't hang.
3332 */
3333 static void
3334 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
3335 {
3336 genx_batch_emit_pipe_control(&cmd_buffer->batch,
3337 cmd_buffer->device->info,
3338 cmd_buffer->state.current_pipeline,
3339 ANV_PIPE_CS_STALL_BIT |
3340 ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
3341 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3342 pc.IndirectStatePointersDisable = true;
3343 pc.CommandStreamerStallEnable = true;
3344 anv_debug_dump_pc(pc, __func__);
3345 }
3346 }
3347
3348 static VkResult
3349 end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
3350 {
3351 if (anv_batch_has_error(&cmd_buffer->batch))
3352 return cmd_buffer->batch.status;
3353
3354 anv_measure_endcommandbuffer(cmd_buffer);
3355
3356 if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
3357 anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3358 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
3359 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3360 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
3361 return VK_SUCCESS;
3362 }
3363
3364 /* Flush query clears using blorp so that secondary query writes do not
3365 * race with the clear.
3366 */
3367 if (cmd_buffer->state.queries.clear_bits) {
3368 anv_add_pending_pipe_bits(cmd_buffer,
3369 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
3370 "query clear flush prior command buffer end");
3371 }
3372
3373 /* Flush any in-progress CCS/MCS operations in preparation for chaining. */
3374 genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
3375
3376 genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
3377
3378 /* Turn object level preemption back on if it was disabled, to have it in a
3379 * known state at the beginning of a new command buffer.
3380 */
3381 if (!cmd_buffer->state.gfx.object_preemption)
3382 genX(cmd_buffer_set_preemption)(cmd_buffer, true);
3383
3384 /* We want every command buffer to start with the PMA fix in a known state,
3385 * so we disable it at the end of the command buffer.
3386 */
3387 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
3388
3389 /* Wa_14015814527
3390 *
3391 * Apply task URB workaround in the end of primary or secondary cmd_buffer.
3392 */
3393 genX(apply_task_urb_workaround)(cmd_buffer);
3394
3395 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3396
3397 emit_isp_disable(cmd_buffer);
3398
3399 #if GFX_VER >= 12
3400 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
3401 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3402 genX(cmd_buffer_set_protected_memory)(cmd_buffer, false);
3403 #endif
3404
3405 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
3406
3407 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
3408
3409 return VK_SUCCESS;
3410 }
3411
3412 VkResult
3413 genX(EndCommandBuffer)(
3414 VkCommandBuffer commandBuffer)
3415 {
3416 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3417
3418 VkResult status = end_command_buffer(cmd_buffer);
3419 if (status != VK_SUCCESS)
3420 return status;
3421
3422 /* If there was MSAA access over the compute/transfer queue, a companion
3423 * RCS command buffer was used; end it properly as well.
3424 */
3425 if (cmd_buffer->companion_rcs_cmd_buffer) {
3426 assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
3427 anv_cmd_buffer_is_blitter_queue(cmd_buffer));
3428 status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
3429 }
3430
3431 ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
3432
3433 return status;
3434 }
3435
3436 void
3437 genX(CmdExecuteCommands)(
3438 VkCommandBuffer commandBuffer,
3439 uint32_t commandBufferCount,
3440 const VkCommandBuffer* pCmdBuffers)
3441 {
3442 ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
3443
3444 struct anv_device *device = container->device;
3445
3446 if (anv_batch_has_error(&container->batch))
3447 return;
3448
3449 /* The secondary command buffers will assume that the PMA fix is disabled
3450 * when they begin executing. Make sure this is true.
3451 */
3452 genX(cmd_buffer_enable_pma_fix)(container, false);
3453
3454 /* Turn on preemption in case it was toggled off. */
3455 if (!container->state.gfx.object_preemption)
3456 genX(cmd_buffer_set_preemption)(container, true);
3457
3458 /* Wa_14015814527
3459 *
3460 * Apply task URB workaround before secondary cmd buffers.
3461 */
3462 genX(apply_task_urb_workaround)(container);
3463
3464 /* Flush query clears using blorp so that secondary query writes do not
3465 * race with the clear.
3466 */
3467 if (container->state.queries.clear_bits) {
3468 anv_add_pending_pipe_bits(container,
3469 ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
3470 "query clear flush prior to secondary buffer");
3471 }
3472
3473 /* Ensure we're in a regular drawing cache mode (an assumption made by all
3474 * secondaries).
3475 */
3476 genX(cmd_buffer_update_color_aux_op(container, ISL_AUX_OP_NONE));
3477
3478 /* The secondary command buffer doesn't know which textures etc. have been
3479 * flushed prior to their execution. Apply those flushes now.
3480 */
3481 genX(cmd_buffer_apply_pipe_flushes)(container);
3482
3483 genX(cmd_buffer_flush_generated_draws)(container);
3484
3485 UNUSED enum anv_cmd_descriptor_buffer_mode db_mode =
3486 container->state.current_db_mode;
3487
3488 /* Do a first pass to copy the surface state content of the render targets
3489 * if needed.
3490 */
3491 bool need_surface_state_copy = false;
3492 for (uint32_t i = 0; i < commandBufferCount; i++) {
3493 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3494
3495 if (secondary->usage_flags &
3496 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3497 need_surface_state_copy = true;
3498 break;
3499 }
3500 }
3501
3502 if (need_surface_state_copy) {
3503 if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3504 genX(cmd_buffer_set_protected_memory)(container, false);
3505
3506 /* The memcpy will take care of the 3D preemption requirements. */
3507 struct anv_memcpy_state memcpy_state;
3508 genX(emit_so_memcpy_init)(&memcpy_state, device,
3509 container, &container->batch);
3510
3511 for (uint32_t i = 0; i < commandBufferCount; i++) {
3512 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3513
3514 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
3515 assert(!anv_batch_has_error(&secondary->batch));
3516
3517 if (secondary->usage_flags &
3518 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3519 /* If we're continuing a render pass from the container, we need
3520 * to copy the surface states for the current subpass into the
3521 * storage we allocated for them in BeginCommandBuffer.
3522 */
3523 struct anv_state src_state = container->state.gfx.att_states;
3524 struct anv_state dst_state = secondary->state.gfx.att_states;
3525 assert(src_state.alloc_size == dst_state.alloc_size);
3526
3527 genX(emit_so_memcpy)(
3528 &memcpy_state,
3529 anv_state_pool_state_address(&device->internal_surface_state_pool,
3530 dst_state),
3531 anv_state_pool_state_address(&device->internal_surface_state_pool,
3532 src_state),
3533 src_state.alloc_size);
3534 }
3535 }
3536 genX(emit_so_memcpy_fini)(&memcpy_state);
3537
3538 anv_add_pending_pipe_bits(container,
3539 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
3540 "Wait for primary->secondary RP surface state copies");
3541 genX(cmd_buffer_apply_pipe_flushes)(container);
3542
3543 if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
3544 genX(cmd_buffer_set_protected_memory)(container, true);
3545 }
3546
3547 /* Ensure preemption is enabled (an assumption made by all secondaries) */
3548 genX(cmd_buffer_set_preemption)(container, true);
3549
3550 for (uint32_t i = 0; i < commandBufferCount; i++) {
3551 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3552
3553 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
3554 assert(!anv_batch_has_error(&secondary->batch));
3555
3556 if (secondary->state.conditional_render_enabled) {
3557 if (!container->state.conditional_render_enabled) {
3558 /* The secondary buffer was constructed as if it would be executed
3559 * with conditional rendering, so we should satisfy this dependency
3560 * regardless of conditional rendering being enabled in the container.
3561 */
3562 struct mi_builder b;
3563 mi_builder_init(&b, device->info, &container->batch);
3564 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
3565 mi_imm(UINT64_MAX));
3566 }
3567 }
3568
3569 anv_cmd_buffer_add_secondary(container, secondary);
3570
3571 /* Add secondary buffer's RCS command buffer to container buffer's RCS
3572 * command buffer for execution if secondary RCS is valid.
3573 */
3574 if (secondary->companion_rcs_cmd_buffer != NULL) {
3575 VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
3576 if (result != VK_SUCCESS) {
3577 anv_batch_set_error(&container->batch, result);
3578 return;
3579 }
3580
3581 anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
3582 secondary->companion_rcs_cmd_buffer);
3583 }
3584
3585 assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
3586 secondary->perf_query_pool == container->perf_query_pool);
3587 if (secondary->perf_query_pool)
3588 container->perf_query_pool = secondary->perf_query_pool;
3589
3590 #if INTEL_NEEDS_WA_1808121037
3591 if (secondary->state.gfx.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
3592 container->state.gfx.depth_reg_mode = secondary->state.gfx.depth_reg_mode;
3593 #endif
3594
3595 container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
3596
3597 db_mode = secondary->state.current_db_mode;
3598 }
3599
3600 /* The secondary isn't counted in our VF cache tracking so we need to
3601 * invalidate the whole thing.
3602 */
3603 if (GFX_VER == 9) {
3604 anv_add_pending_pipe_bits(container,
3605 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
3606 "Secondary cmd buffer not tracked in VF cache");
3607 }
3608
3609 #if INTEL_WA_16014538804_GFX_VER
3610 if (anv_cmd_buffer_is_render_queue(container) &&
3611 intel_needs_workaround(device->info, 16014538804))
3612 anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
3613 #endif
3614
3615 /* The secondary may have selected a different pipeline (3D or compute) and
3616 * may have changed the current L3$ configuration. Reset our tracking
3617 * variables to invalid values to ensure that we re-emit these in the case
3618 * where we do any draws or compute dispatches from the container after the
3619 * secondary has returned.
3620 */
3621 container->state.current_pipeline = UINT32_MAX;
3622 container->state.current_l3_config = NULL;
3623 container->state.current_hash_scale = 0;
3624 container->state.gfx.push_constant_stages = 0;
3625
3626 memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
3627
3628 /* Reemit all GFX instructions in container */
3629 memcpy(container->state.gfx.dyn_state.dirty,
3630 device->gfx_dirty_state,
3631 sizeof(container->state.gfx.dyn_state.dirty));
3632 if (container->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
3633 /* Also recompute the CPS_STATE offset */
3634 struct vk_dynamic_graphics_state *dyn =
3635 &container->vk.dynamic_graphics_state;
3636 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
3637 }
3638
3639 /* Each of the secondary command buffers will use its own state base
3640 * address. We need to re-emit state base address for the container after
3641 * all of the secondaries are done.
3642 */
3643 if (container->device->vk.enabled_extensions.EXT_descriptor_buffer) {
3644 #if GFX_VERx10 >= 125
3645 /* If the last secondary had a different mode, reemit the last pending
3646 * mode. Otherwise, we can do a lighter binding table pool update.
3647 */
3648 if (db_mode != container->state.current_db_mode) {
3649 container->state.current_db_mode = db_mode;
3650 genX(cmd_buffer_emit_state_base_address)(container);
3651 } else {
3652 genX(cmd_buffer_emit_bt_pool_base_address)(container);
3653 }
3654 #else
3655 genX(cmd_buffer_emit_state_base_address)(container);
3656 #endif
3657 } else {
3658 genX(cmd_buffer_emit_bt_pool_base_address)(container);
3659 }
3660
3661 /* Copy of utrace timestamp buffers from secondary into container */
3662 if (u_trace_enabled(&device->ds.trace_context)) {
3663 trace_intel_begin_trace_copy(&container->trace);
3664
3665 struct anv_memcpy_state memcpy_state;
3666 genX(emit_so_memcpy_init)(&memcpy_state, device,
3667 container, &container->batch);
3668 uint32_t num_traces = 0;
3669 for (uint32_t i = 0; i < commandBufferCount; i++) {
3670 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
3671
3672 num_traces += secondary->trace.num_traces;
3673 u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
3674 u_trace_end_iterator(&secondary->trace),
3675 &container->trace,
3676 &memcpy_state,
3677 anv_device_utrace_emit_gfx_copy_buffer);
3678 }
3679 genX(emit_so_memcpy_fini)(&memcpy_state);
3680
3681 trace_intel_end_trace_copy(&container->trace, num_traces);
3682
3683 /* Memcpy is done using the 3D pipeline. */
3684 container->state.current_pipeline = _3D;
3685 }
3686 }
3687
3688 static inline enum anv_pipe_bits
3689 anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3690 VkAccessFlags2 flags)
3691 {
3692 enum anv_pipe_bits pipe_bits = 0;
3693
3694 u_foreach_bit64(b, flags) {
3695 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3696 case VK_ACCESS_2_SHADER_WRITE_BIT:
3697 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
3698 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3699 /* We're transitioning a buffer that was previously used as write
3700 * destination through the data port. To make its content available
3701 * to future operations, flush the hdc pipeline.
3702 */
3703 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3704 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3705 break;
3706 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
3707 /* We're transitioning a buffer that was previously used as render
3708 * target. To make its content available to future operations, flush
3709 * the render target cache.
3710 */
3711 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3712 break;
3713 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3714 /* We're transitioning a buffer that was previously used as depth
3715 * buffer. To make its content available to future operations, flush
3716 * the depth cache.
3717 */
3718 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3719 break;
3720 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
3721 /* We're transitioning a buffer that was previously used as a
3722 * transfer write destination. Generic write operations include color
3723 * & depth operations as well as buffer operations like :
3724 * - vkCmdClearColorImage()
3725 * - vkCmdClearDepthStencilImage()
3726 * - vkCmdBlitImage()
3727 * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
3728 *
3729 * Most of these operations are implemented using Blorp which writes
3730 * through the render target cache or the depth cache on the graphics
3731 * queue. On the compute queue, the writes are done through the data
3732 * port.
3733 */
3734 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
3735 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3736 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3737 } else {
3738 /* We can use the data port when trying to stay in compute mode on
3739 * the RCS.
3740 */
3741 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3742 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3743 /* Most operations are done through RT/depth writes */
3744 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3745 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3746 }
3747 break;
3748 case VK_ACCESS_2_MEMORY_WRITE_BIT:
3749 /* We're transitioning a buffer for generic write operations. Flush
3750 * all the caches.
3751 */
3752 pipe_bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
3753 break;
3754 case VK_ACCESS_2_HOST_WRITE_BIT:
3755 /* We're transitioning a buffer for access by CPU. Invalidate
3756 * all the caches. Since data and tile caches don't have invalidate,
3757 * we are forced to flush those as well.
3758 */
3759 pipe_bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
3760 pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3761 break;
3762 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3763 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3764 /* We're transitioning a buffer written either from VS stage or from
3765 * the command streamer (see CmdEndTransformFeedbackEXT), so we just
3766 * need to stall the CS.
3767 *
3768 * Streamout writes apparently bypass L3, so in order to make them
3769 * visible to the destination, we need to invalidate the other
3770 * caches.
3771 */
3772 pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
3773 break;
3774 default:
3775 break; /* Nothing to do */
3776 }
3777 }
3778
3779 return pipe_bits;
3780 }
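
/*
 * Worked example (illustrative only): on the render queue, a barrier whose
 * srcAccessMask is VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
 * VK_ACCESS_2_TRANSFER_WRITE_BIT resolves, per the switch above, to
 *
 *    ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
 *    ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
 *    ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
 *    ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT
 *
 * i.e. every cache the graphics queue may have written through for those
 * accesses gets flushed before the destination side of the barrier runs.
 */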
3781
3782 static inline enum anv_pipe_bits
3783 anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3784 VkAccessFlags2 flags)
3785 {
3786 struct anv_device *device = cmd_buffer->device;
3787 enum anv_pipe_bits pipe_bits = 0;
3788
3789 u_foreach_bit64(b, flags) {
3790 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3791 case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
3792 /* Indirect draw commands take a buffer as input that we're going to
3793 * read from the command streamer to load some of the HW registers
3794 * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
3795 * command streamer stall so that all the cache flushes have
3796 * completed before the command streamer loads from memory.
3797 */
3798 pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3799 if (device->info->ver == 9) {
3800 /* Indirect draw commands on Gfx9 also set gl_BaseVertex &
3801 * gl_BaseIndex through a vertex buffer, so invalidate that cache.
3802 */
3803 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3804 }
3805 /* For CmdDispatchIndirect, we load indirect gl_NumWorkGroups through
3806 * an A64 message, so we need to invalidate constant cache.
3807 */
3808 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3809 /* Tile & Data cache flushes are needed for Cmd*Indirect* commands since
3810 * the command streamer is not L3 coherent.
3811 */
3812 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT |
3813 ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3814 break;
3815 case VK_ACCESS_2_INDEX_READ_BIT:
3816 case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
3817 /* We're transitioning a buffer to be used as input for vkCmdDraw*
3818 * commands, so we invalidate the VF cache to make sure there is no
3819 * stale data when we start rendering.
3820 */
3821 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3822 break;
3823 case VK_ACCESS_2_UNIFORM_READ_BIT:
3824 case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
3825 /* We're transitioning a buffer to be used as uniform data. Because
3826 * uniform is accessed through the data port & sampler, we need to
3827 * invalidate the texture cache (sampler) & constant cache (data
3828 * port) to avoid stale data.
3829 */
3830 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3831 if (device->physical->compiler->indirect_ubos_use_sampler) {
3832 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3833 } else {
3834 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3835 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3836 }
3837 break;
3838 case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
3839 case VK_ACCESS_2_TRANSFER_READ_BIT:
3840 case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
3841 /* Transitioning a buffer to be read through the sampler, so
3842 * invalidate the texture cache; we don't want any stale data.
3843 */
3844 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3845 break;
3846 case VK_ACCESS_2_SHADER_READ_BIT:
3847 /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
3848 * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
3849 */
3850 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3851 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3852 if (!device->physical->compiler->indirect_ubos_use_sampler) {
3853 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3854 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3855 }
3856 break;
3857 case VK_ACCESS_2_MEMORY_READ_BIT:
3858 /* Transitioning a buffer for generic read, invalidate all the
3859 * caches.
3860 */
3861 pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3862 break;
3863 case VK_ACCESS_2_MEMORY_WRITE_BIT:
3864 /* Generic write, make sure all previously written things land in
3865 * memory.
3866 */
3867 pipe_bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
3868 break;
3869 case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
3870 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
3871 /* Transitioning a buffer for conditional rendering or transform
3872 * feedback. We'll load the content of this buffer into HW registers
3873 * using the command streamer, so we need to stall the command
3874 * streamer to make sure any in-flight flush operations have
3875 * completed.
3876 */
3877 pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3878 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3879 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3880 break;
3881 case VK_ACCESS_2_HOST_READ_BIT:
3882 /* We're transitioning a buffer that was written by CPU. Flush
3883 * all the caches.
3884 */
3885 pipe_bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
3886 break;
3887 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3888 /* We're transitioning a buffer to be written by the streamout fixed
3889 * function. This one is apparently not L3 coherent, so we need a
3890 * tile cache flush to make sure any previous write is not going to
3891 * create WaW hazards.
3892 */
3893 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3894 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3895 break;
3896 case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
3897 /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
3898 * storage buffer, physical storage buffer, storage texel buffer, or
3899 * storage image in any shader pipeline stage.
3900 *
3901 * Any storage buffers or images written to must be invalidated and
3902 * flushed before the shader can access them.
3903 *
3904 * Both HDC & Untyped flushes also do invalidation. This is why we
3905 * use this here on Gfx12+.
3906 *
3907 * Gfx11 and prior don't have HDC. Only Data cache flush is available
3908 * and it only operates on the written cache lines.
3909 */
3910 if (device->info->ver >= 12) {
3911 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3912 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3913 }
3914 break;
3915 case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
3916 pipe_bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
3917 break;
3918 default:
3919 break; /* Nothing to do */
3920 }
3921 }
3922
3923 return pipe_bits;
3924 }
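
/*
 * Worked example (illustrative only): the common "render to a color
 * attachment, then sample it" barrier, with
 * srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT and
 * dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT, combines the two
 * helpers above into
 *
 *    enum anv_pipe_bits bits =
 *       anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
 *       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
 *
 * which here yields ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
 * ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT. This is exactly how
 * cmd_buffer_accumulate_barrier_bits() below combines the two sides.
 */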
3925
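/* Helpers classifying barrier stage/access masks. They are deliberately
 * conservative: the catch-all VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT and
 * VK_ACCESS_2_MEMORY_WRITE_BIT bits match every category they could
 * possibly belong to.
 */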
3926 static inline bool
3927 stage_is_shader(const VkPipelineStageFlags2 stage)
3928 {
3929 return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3930 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3931 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3932 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3933 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
3934 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
3935 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3936 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3937 VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
3938 VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
3939 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
3940 }
3941
3942 static inline bool
3943 stage_is_transfer(const VkPipelineStageFlags2 stage)
3944 {
3945 return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3946 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
3947 }
3948
3949 static inline bool
3950 stage_is_video(const VkPipelineStageFlags2 stage)
3951 {
3952 return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3953 #ifdef VK_ENABLE_BETA_EXTENSIONS
3954 VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
3955 #endif
3956 VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
3957 }
3958
3959 static inline bool
3960 mask_is_shader_write(const VkAccessFlags2 access)
3961 {
3962 return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
3963 VK_ACCESS_2_MEMORY_WRITE_BIT |
3964 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
3965 }
3966
3967 static inline bool
3968 mask_is_write(const VkAccessFlags2 access)
3969 {
3970 return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
3971 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
3972 VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
3973 VK_ACCESS_2_TRANSFER_WRITE_BIT |
3974 VK_ACCESS_2_HOST_WRITE_BIT |
3975 VK_ACCESS_2_MEMORY_WRITE_BIT |
3976 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
3977 VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
3978 #ifdef VK_ENABLE_BETA_EXTENSIONS
3979 VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
3980 #endif
3981 VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
3982 VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
3983 VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
3984 VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
3985 VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
3986 VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
3987 }
3988
3989 static inline bool
3990 mask_is_transfer_write(const VkAccessFlags2 access)
3991 {
3992 return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
3993 VK_ACCESS_2_MEMORY_WRITE_BIT);
3994 }
3995
3996 static void
3997 cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
3998 uint32_t n_dep_infos,
3999 const VkDependencyInfo *dep_infos)
4000 {
4001 assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
4002
4003 bool flush_llc = false;
4004 bool flush_ccs = false;
4005
4006 for (uint32_t d = 0; d < n_dep_infos; d++) {
4007 const VkDependencyInfo *dep_info = &dep_infos[d];
4008
4009
4010 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4011 const VkImageMemoryBarrier2 *img_barrier =
4012 &dep_info->pImageMemoryBarriers[i];
4013
4014 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4015 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4016
4017 /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
4018 * memory barrier defines a queue family ownership transfer.
4019 */
4020 if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
4021 flush_llc = true;
4022
4023 VkImageAspectFlags img_aspects =
4024 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4025 anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
4026 const uint32_t plane =
4027 anv_image_aspect_to_plane(image, 1UL << aspect_bit);
4028 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
4029 flush_ccs = true;
4030 }
4031 }
4032 }
4033
4034 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4035 /* Flush the cache if something written by the video operations is
4036 * used by any stage other than the video encode/decode stages, or if
4037 * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which case
4038 * this memory barrier defines a queue family ownership transfer).
4039 */
4040 if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
4041 mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
4042 !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
4043 (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
4044 dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
4045 flush_llc = true;
4046 break;
4047 }
4048 }
4049
4050 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4051 /* Flush the cache if something written by the video operations is
4052 * used by any stage other than the video encode/decode stages.
4053 */
4054 if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
4055 mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4056 !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
4057 flush_llc = true;
4058 break;
4059 }
4060 }
4061
4062 /* We cannot gather more information than that. */
4063 if (flush_ccs && flush_llc)
4064 break;
4065 }
4066
4067 if (flush_ccs || flush_llc) {
4068 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
4069 #if GFX_VERx10 >= 125
4070 fd.FlushCCS = flush_ccs;
4071 #endif
4072 #if GFX_VER >= 12
4073 /* Using this bit on Gfx9 triggers a GPU hang.
4074 * This is undocumented behavior. Gfx12 seems fine.
4075 * TODO: check Gfx11
4076 */
4077 fd.FlushLLC = flush_llc;
4078 #endif
4079 }
4080 }
4081 }
4082
4083 static void
4084 cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
4085 uint32_t n_dep_infos,
4086 const VkDependencyInfo *dep_infos)
4087 {
4088 #if GFX_VERx10 >= 125
4089 assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
4090
4091 /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
4092 * from being a destination to a source.
4093 */
4094 bool flush_llc = false;
4095 bool flush_ccs = false;
4096
4097 for (uint32_t d = 0; d < n_dep_infos; d++) {
4098 const VkDependencyInfo *dep_info = &dep_infos[d];
4099
4100 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4101 const VkImageMemoryBarrier2 *img_barrier =
4102 &dep_info->pImageMemoryBarriers[i];
4103
4104 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4105 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4106
4107 /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
4108 * memory barrier defines a queue family transfer operation.
4109 */
4110 if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
4111 flush_llc = true;
4112
4113 /* Flush the cache if a transfer command reads the output of a previous
4114 * transfer command. Ideally we should just wait for completion, but for
4115 * now just flush the cache to make the data visible.
4116 */
4117 if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
4118 img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
4119 (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
4120 img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
4121 flush_llc = true;
4122 }
4123
4124 VkImageAspectFlags img_aspects =
4125 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4126 anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
4127 const uint32_t plane =
4128 anv_image_aspect_to_plane(image, 1UL << aspect_bit);
4129 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
4130 flush_ccs = true;
4131 }
4132 }
4133 }
4134
4135 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4136 /* Flush the cache if something is written by the transfer command
4137 * and used by any stage other than the transfer stage, or if
4138 * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
4139 * case this memory barrier defines a queue family transfer operation).
4140 */
4141 if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
4142 mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
4143 (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
4144 dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
4145 flush_llc = true;
4146 break;
4147 }
4148 }
4149
4150 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4151 /* Flush the cache if something is written by the transfer command
4152 * and used by any stage other than the transfer stage.
4153 */
4154 if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
4155 mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
4156 flush_llc = true;
4157 break;
4158 }
4159 }
4160
4161 /* We cannot gather more information than that. */
4162 if (flush_ccs && flush_llc)
4163 break;
4164 }
4165
4166 if (flush_ccs || flush_llc) {
4167 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
4168 if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
4169 genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
4170 cmd_buffer->device);
4171 }
4172 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
4173 fd.FlushCCS = flush_ccs;
4174 fd.FlushLLC = flush_llc;
4175 }
4176 }
4177 #endif
4178 }
4179
4180 static inline bool
4181 cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
4182 {
4183 /* Query copies are only written with dataport, so we only need to check
4184 * that flag.
4185 */
4186 return (cmd_buffer->state.queries.buffer_write_bits &
4187 ANV_QUERY_WRITES_DATA_FLUSH) != 0;
4188 }
4189
4190 static void
4191 cmd_buffer_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer,
4192 uint32_t n_dep_infos,
4193 const VkDependencyInfo *dep_infos,
4194 VkPipelineStageFlags2 *out_src_stages,
4195 VkPipelineStageFlags2 *out_dst_stages,
4196 enum anv_pipe_bits *out_bits)
4197 {
4198 /* XXX: Right now, we're really dumb and just flush whatever categories
4199 * the app asks for. One of these days we may make this a bit better but
4200 * right now that's all the hardware allows for in most areas.
4201 */
4202 VkAccessFlags2 src_flags = 0;
4203 VkAccessFlags2 dst_flags = 0;
4204
4205 VkPipelineStageFlags2 src_stages = 0;
4206 VkPipelineStageFlags2 dst_stages = 0;
4207
4208 #if GFX_VER < 20
4209 bool apply_sparse_flushes = false;
4210 struct anv_device *device = cmd_buffer->device;
4211 #endif
4212 bool flush_query_copies = false;
4213
4214 for (uint32_t d = 0; d < n_dep_infos; d++) {
4215 const VkDependencyInfo *dep_info = &dep_infos[d];
4216
4217 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
4218 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
4219 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
4220
4221 src_stages |= dep_info->pMemoryBarriers[i].srcStageMask;
4222 dst_stages |= dep_info->pMemoryBarriers[i].dstStageMask;
4223
4224 /* Shader writes to buffers that could then be written by a transfer
4225 * command (including queries).
4226 */
4227 if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
4228 mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4229 stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
4230 cmd_buffer->state.queries.buffer_write_bits |=
4231 ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
4232 }
4233
4234 if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
4235 mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
4236 cmd_buffer_has_pending_copy_query(cmd_buffer))
4237 flush_query_copies = true;
4238
4239 #if GFX_VER < 20
4240 /* There's no way of knowing if this memory barrier is related to
4241 * sparse buffers! This is pretty horrible.
4242 */
4243 if (mask_is_write(src_flags) &&
4244 p_atomic_read(&device->num_sparse_resources) > 0)
4245 apply_sparse_flushes = true;
4246 #endif
4247 }
4248
4249 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
4250 const VkBufferMemoryBarrier2 *buf_barrier =
4251 &dep_info->pBufferMemoryBarriers[i];
4252
4253 src_flags |= buf_barrier->srcAccessMask;
4254 dst_flags |= buf_barrier->dstAccessMask;
4255
4256 src_stages |= buf_barrier->srcStageMask;
4257 dst_stages |= buf_barrier->dstStageMask;
4258
4259 /* Shader writes to buffers that could then be written by a transfer
4260 * command (including queries).
4261 */
4262 if (stage_is_shader(buf_barrier->srcStageMask) &&
4263 mask_is_shader_write(buf_barrier->srcAccessMask) &&
4264 stage_is_transfer(buf_barrier->dstStageMask)) {
4265 cmd_buffer->state.queries.buffer_write_bits |=
4266 ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
4267 }
4268
4269 if (stage_is_transfer(buf_barrier->srcStageMask) &&
4270 mask_is_transfer_write(buf_barrier->srcAccessMask) &&
4271 cmd_buffer_has_pending_copy_query(cmd_buffer))
4272 flush_query_copies = true;
4273
4274 #if GFX_VER < 20
4275 ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
4276
4277 if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
4278 apply_sparse_flushes = true;
4279 #endif
4280 }
4281
4282 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
4283 const VkImageMemoryBarrier2 *img_barrier =
4284 &dep_info->pImageMemoryBarriers[i];
4285
4286 src_flags |= img_barrier->srcAccessMask;
4287 dst_flags |= img_barrier->dstAccessMask;
4288
4289 src_stages |= img_barrier->srcStageMask;
4290 dst_stages |= img_barrier->dstStageMask;
4291
4292 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
4293 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
4294
4295 uint32_t base_layer, layer_count;
4296 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
4297 base_layer = 0;
4298 layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
4299 } else {
4300 base_layer = range->baseArrayLayer;
4301 layer_count = vk_image_subresource_layer_count(&image->vk, range);
4302 }
4303 const uint32_t level_count =
4304 vk_image_subresource_level_count(&image->vk, range);
4305
4306 VkImageLayout old_layout = img_barrier->oldLayout;
4307 VkImageLayout new_layout = img_barrier->newLayout;
4308
4309 /* If we're inside a render pass, the runtime might have converted
4310 * some layouts from GENERAL to FEEDBACK_LOOP. Check if that's the
4311 * case and reconvert back to the original layout so that application
4312 * barriers within renderpass are operating with consistent layouts.
4313 */
4314 if (!cmd_buffer->vk.runtime_rp_barrier &&
4315 cmd_buffer->vk.render_pass != NULL) {
4316 assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
4317 image));
4318 VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
4319
4320 vk_command_buffer_get_attachment_layout(
4321 &cmd_buffer->vk, &image->vk,
4322 &subpass_att_layout, &subpass_stencil_att_layout);
4323
4324 old_layout = subpass_att_layout;
4325 new_layout = subpass_att_layout;
4326 }
4327
4328 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4329 transition_depth_buffer(cmd_buffer, image,
4330 range->baseMipLevel, level_count,
4331 base_layer, layer_count,
4332 old_layout, new_layout,
4333 false /* will_full_fast_clear */);
4334 }
4335
4336 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4337 transition_stencil_buffer(cmd_buffer, image,
4338 range->baseMipLevel, level_count,
4339 base_layer, layer_count,
4340 old_layout, new_layout,
4341 false /* will_full_fast_clear */);
4342 }
4343
4344 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
4345 VkImageAspectFlags color_aspects =
4346 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4347 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
4348 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
4349 range->baseMipLevel, level_count,
4350 base_layer, layer_count,
4351 old_layout, new_layout,
4352 img_barrier->srcQueueFamilyIndex,
4353 img_barrier->dstQueueFamilyIndex,
4354 false /* will_full_fast_clear */);
4355 }
4356 }
4357 #if GFX_VER < 20
4358 /* Mark image as compressed if the destination layout has untracked
4359 * writes to the aux surface.
4360 */
4361 VkImageAspectFlags aspects =
4362 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
4363 anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
4364 VkImageAspectFlagBits aspect = 1UL << aspect_bit;
4365 if (anv_layout_has_untracked_aux_writes(
4366 device->info,
4367 image, aspect,
4368 img_barrier->newLayout,
4369 cmd_buffer->queue_family->queueFlags)) {
4370 for (uint32_t l = 0; l < level_count; l++) {
4371 const uint32_t level = range->baseMipLevel + l;
4372 const uint32_t aux_layers =
4373 anv_image_aux_layers(image, aspect, level);
4374
4375 if (base_layer >= aux_layers)
4376 break; /* We will only get fewer layers as level increases */
4377
4378 uint32_t level_layer_count =
4379 MIN2(layer_count, aux_layers - base_layer);
4380
4381 set_image_compressed_bit(cmd_buffer, image, aspect,
4382 level,
4383 base_layer, level_layer_count,
4384 true);
4385 }
4386 }
4387 }
4388
4389 if (anv_image_is_sparse(image) && mask_is_write(src_flags))
4390 apply_sparse_flushes = true;
4391 #endif
4392 }
4393 }
4394
4395 enum anv_pipe_bits bits =
4396 anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
4397 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
4398
4399    /* Stages that require a stall at the pixel scoreboard. */
4400 VkPipelineStageFlags2 pb_stall_stages =
4401 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
4402 VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
4403 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4404 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
4405 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
4406 if (anv_cmd_buffer_is_render_queue(cmd_buffer)) {
4407 /* On a render queue, the following stages can also use a pixel shader.
4408 */
4409 pb_stall_stages |=
4410 VK_PIPELINE_STAGE_2_TRANSFER_BIT |
4411 VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4412 VK_PIPELINE_STAGE_2_BLIT_BIT |
4413 VK_PIPELINE_STAGE_2_CLEAR_BIT;
4414 }
4415 VkPipelineStageFlags2 cs_stall_stages =
4416 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
4417 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
4418 VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
4419 VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
4420 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
4421 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
4422 /* On a compute queue, the following stages can also use a compute
4423 * shader.
4424 */
4425 cs_stall_stages |=
4426 VK_PIPELINE_STAGE_2_TRANSFER_BIT |
4427 VK_PIPELINE_STAGE_2_RESOLVE_BIT |
4428 VK_PIPELINE_STAGE_2_BLIT_BIT |
4429 VK_PIPELINE_STAGE_2_CLEAR_BIT;
4430 } else if (anv_cmd_buffer_is_render_queue(cmd_buffer) &&
4431 cmd_buffer->state.current_pipeline == GPGPU) {
4432 /* In GPGPU mode, the render queue can also use a compute shader for
4433 * transfer operations.
4434 */
4435 cs_stall_stages |= VK_PIPELINE_STAGE_2_TRANSFER_BIT;
4436 }
4437
4438 /* Prior to Gfx20, we can restrict pb-stall/cs-stall to some pipeline
4439     * modes. Gfx20 doesn't do pipeline switches so we have to assume the worst
4440 * case.
4441 */
4442 const bool needs_pb_stall =
4443 anv_cmd_buffer_is_render_queue(cmd_buffer) &&
4444 #if GFX_VER < 20
4445 cmd_buffer->state.current_pipeline == _3D &&
4446 #endif
4447 (src_stages & pb_stall_stages);
4448 if (needs_pb_stall) {
4449 bits |= GFX_VERx10 >= 125 ?
4450 ANV_PIPE_PSS_STALL_SYNC_BIT :
4451 ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
4452 }
4453 const bool needs_cs_stall =
4454 anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) &&
4455 #if GFX_VER < 20
4456 cmd_buffer->state.current_pipeline == GPGPU &&
4457 #endif
4458 (src_stages & cs_stall_stages);
4459 if (needs_cs_stall)
4460 bits |= ANV_PIPE_CS_STALL_BIT;
4461
4462 #if GFX_VER < 20
4463 /* Our HW implementation of the sparse feature prior to Xe2 lives in the
4464 * GAM unit (interface between all the GPU caches and external memory).
4465 * As a result writes to NULL bound images & buffers that should be
4466     * ignored are actually still visible in the caches. The only way for us
4467     * to get NULL bound regions to correctly return 0s is to evict the caches
4468     * so that they are repopulated with 0s.
4469     *
4470     * Our understanding is that Xe2 started to tag the L3 cache with some
4471     * kind of physical address information instead. It is therefore able to
4472 * detect that a cache line in the cache is going to a null tile and so
4473 * the L3 cache also has a sparse compatible behavior and we don't need
4474 * to flush anymore.
4475 */
4476 if (apply_sparse_flushes)
4477 bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
4478 #endif
4479
4480 /* Copies from query pools are executed with a shader writing through the
4481 * dataport.
4482 */
4483 if (flush_query_copies) {
4484 bits |= (GFX_VER >= 12 ?
4485 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
4486 }
4487
4488 if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
4489 genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
4490
4491 *out_src_stages = src_stages;
4492 *out_dst_stages = dst_stages;
4493 *out_bits = bits;
4494 }
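
/* For illustration only: a hedged sketch (not driver code) of how a common
 * application barrier feeds the accumulation above. A render-target-write to
 * shader-read dependency such as
 *
 *    VkImageMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
 *       .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
 *       // image/subresourceRange omitted
 *    };
 *
 * would typically turn into a render target cache flush (for the write
 * access) plus a texture cache invalidation (for the sampled read), with the
 * exact bits decided by anv_pipe_flush_bits_for_access_flags() and
 * anv_pipe_invalidate_bits_for_access_flags(), plus any pb/cs stall picked
 * from the source stages.
 */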
4495
4496 static void
4497 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
4498 uint32_t n_dep_infos,
4499 const VkDependencyInfo *dep_infos,
4500 const char *reason)
4501 {
4502 switch (cmd_buffer->batch.engine_class) {
4503 case INTEL_ENGINE_CLASS_VIDEO:
4504 cmd_buffer_barrier_video(cmd_buffer, n_dep_infos, dep_infos);
4505 break;
4506
4507 case INTEL_ENGINE_CLASS_COPY:
4508 cmd_buffer_barrier_blitter(cmd_buffer, n_dep_infos, dep_infos);
4509 break;
4510
4511 case INTEL_ENGINE_CLASS_RENDER:
4512 case INTEL_ENGINE_CLASS_COMPUTE: {
4513 VkPipelineStageFlags2 src_stages, dst_stages;
4514 enum anv_pipe_bits bits;
4515 cmd_buffer_accumulate_barrier_bits(cmd_buffer, n_dep_infos, dep_infos,
4516 &src_stages, &dst_stages, &bits);
4517
4518 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
4519 break;
4520 }
4521
4522 default:
4523 unreachable("Invalid engine class");
4524 }
4525 }
4526
4527 void genX(CmdPipelineBarrier2)(
4528 VkCommandBuffer commandBuffer,
4529 const VkDependencyInfo* pDependencyInfo)
4530 {
4531 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4532
4533 cmd_buffer_barrier(cmd_buffer, 1, pDependencyInfo, "pipe barrier");
4534 }
4535
4536 void
4537 genX(batch_emit_breakpoint)(struct anv_batch *batch,
4538 struct anv_device *device,
4539 bool emit_before_draw)
4540 {
4541 /* Update draw call count once */
4542 uint32_t draw_count = emit_before_draw ?
4543 p_atomic_inc_return(&device->draw_call_count) :
4544 p_atomic_read(&device->draw_call_count);
4545
4546 if (((draw_count == intel_debug_bkp_before_draw_count &&
4547 emit_before_draw) ||
4548 (draw_count == intel_debug_bkp_after_draw_count &&
4549 !emit_before_draw))) {
4550 struct anv_address wait_addr =
4551 anv_state_pool_state_address(&device->dynamic_state_pool,
4552 device->breakpoint);
4553
4554 anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
4555 sem.WaitMode = PollingMode;
4556 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
4557 sem.SemaphoreDataDword = 0x1;
4558 sem.SemaphoreAddress = wait_addr;
4559 };
4560 }
4561 }
4562
4563 /* Only emits PIPELINE_SELECT; for the whole mode switch with flushing, use
4564  * flush_pipeline_select().
4565 */
4566 void
4567 genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
4568 const struct anv_device *device)
4569 {
4570 /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
4571 #if GFX_VER < 20
4572 anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
4573 ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
4574 #if GFX_VER == 12
4575 ps.MediaSamplerDOPClockGateEnable = true;
4576 #endif
4577 ps.PipelineSelection = pipeline;
4578 #if GFX_VERx10 == 125
4579 /* It might still be better to only enable this when the compute
4580 * pipeline will have DPAS instructions.
4581 */
4582 ps.SystolicModeEnable = pipeline == GPGPU &&
4583 device->vk.enabled_extensions.KHR_cooperative_matrix &&
4584 device->vk.enabled_features.cooperativeMatrix;
4585 #endif
4586 }
4587 #endif /* if GFX_VER < 20 */
4588 }
4589
4590 static void
4591 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4592 uint32_t pipeline)
4593 {
4594 UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4595
4596 if (cmd_buffer->state.current_pipeline == pipeline)
4597 return;
4598
4599 #if GFX_VER >= 20
4600 /* While PIPELINE_SELECT is not needed on Xe2+, our current assumption
4601 * is that the pipelined flushes in the 3D pipeline are not getting
4602 * synchronized with the compute dispatches (and vice versa). So we need
4603     * a CS_STALL prior to the next set of commands to ensure the flushes have
4604 * completed.
4605 *
4606 * The new RESOURCE_BARRIER instruction has support for synchronizing
4607 * 3D/Compute and once we switch to that we should be able to get rid of
4608 * this CS_STALL.
4609 */
4610 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, "pipeline switch stall");
4611
4612 /* Since we are not stalling/flushing caches explicitly while switching
4613 * between the pipelines, we need to apply data dependency flushes recorded
4614 * previously on the resource.
4615 */
4616 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4617 #else
4618
4619 #if GFX_VER == 9
4620 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4621 *
4622 * Software must clear the COLOR_CALC_STATE Valid field in
4623 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4624 * with Pipeline Select set to GPGPU.
4625 *
4626 * The internal hardware docs recommend the same workaround for Gfx9
4627 * hardware too.
4628 */
4629 if (pipeline == GPGPU)
4630 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4631 #endif
4632
4633 #if GFX_VERx10 == 120
4634 /* Undocumented workaround to force the re-emission of
4635 * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
4636 * pipeline without rebinding a pipeline :
4637 * vkCmdBindPipeline(COMPUTE, cs_pipeline);
4638 * vkCmdDispatch(...);
4639 * vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
4640 * vkCmdDraw(...);
4641 * vkCmdDispatch(...);
4642 */
4643 if (pipeline == _3D)
4644 cmd_buffer->state.compute.pipeline_dirty = true;
4645 #endif
4646
4647 /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
4648 * pipeline. That means query clears will not be visible to query
4649 * copy/write. So we need to flush it before going to GPGPU mode.
4650 */
4651 if (cmd_buffer->state.current_pipeline == _3D &&
4652 cmd_buffer->state.queries.clear_bits) {
4653 anv_add_pending_pipe_bits(cmd_buffer,
4654 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
4655 "query clear flush prior to GPGPU");
4656 }
4657
4658    /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
4659 enum anv_pipe_bits bits = 0;
4660
4661 #if GFX_VER >= 12
4662 /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
4663 *
4664 * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
4665 * are flushed through a stalling PIPE_CONTROL command prior to
4666 * programming of PIPELINE_SELECT command transitioning Pipeline Select
4667 * from 3D to GPGPU/Media.
4668 * Software must ensure HDC Pipeline flush and Generic Media State Clear
4669 * is issued through a stalling PIPE_CONTROL command prior to programming
4670 * of PIPELINE_SELECT command transitioning Pipeline Select from
4671 * GPGPU/Media to 3D."
4672 *
4673 * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
4674 * because PIPE was not in MEDIA mode?!
4675 */
4676 bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
4677
4678 if (cmd_buffer->state.current_pipeline == _3D) {
4679 bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4680 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
4681 } else {
4682 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
4683 }
4684 #else
4685 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4686 * PIPELINE_SELECT [DevBWR+]":
4687 *
4688 * Project: DEVSNB+
4689 *
4690 * Software must ensure all the write caches are flushed through a
4691 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4692 * command to invalidate read only caches prior to programming
4693 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4694 *
4695 * Note the cmd_buffer_apply_pipe_flushes will split this into two
4696 * PIPE_CONTROLs.
4697 */
4698 bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4699 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4700 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4701 ANV_PIPE_CS_STALL_BIT |
4702 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4703 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4704 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4705 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4706 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
4707 #endif
4708
4709 /* Wa_16013063087 - State Cache Invalidate must be issued prior to
4710 * PIPELINE_SELECT when switching from 3D to Compute.
4711 *
4712 * SW must do this by programming of PIPECONTROL with “CS Stall” followed by
4713 * a PIPECONTROL with State Cache Invalidate bit set.
4714 *
4715 */
4716 if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
4717 intel_needs_workaround(cmd_buffer->device->info, 16013063087))
4718 bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
4719
4720 anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
4721 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4722
4723 #if GFX_VER == 9
4724 if (pipeline == _3D) {
4725 /* There is a mid-object preemption workaround which requires you to
4726 * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However,
4727 * even without preemption, we have issues with geometry flickering when
4728 * GPGPU and 3D are back-to-back and this seems to fix it. We don't
4729 * really know why.
4730 *
4731 * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4732 *
4733 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4734 * the only bits that are changed are scoreboard related ..."
4735 *
4736 * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
4737 */
4738 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
4739 vfe.MaximumNumberofThreads =
4740 devinfo->max_cs_threads * devinfo->subslice_total - 1;
4741 vfe.NumberofURBEntries = 2;
4742 vfe.URBEntryAllocationSize = 2;
4743 }
4744
4745 /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
4746 * invalid. Set the compute pipeline to dirty to force a re-emit of the
4747 * pipeline in case we get back-to-back dispatch calls with the same
4748 * pipeline and a PIPELINE_SELECT in between.
4749 */
4750 cmd_buffer->state.compute.pipeline_dirty = true;
4751 }
4752 #endif
4753
4754 genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
4755
4756 #if GFX_VER == 9
4757 if (devinfo->platform == INTEL_PLATFORM_GLK) {
4758 /* Project: DevGLK
4759 *
4760 * "This chicken bit works around a hardware issue with barrier logic
4761 * encountered when switching between GPGPU and 3D pipelines. To
4762 * workaround the issue, this mode bit should be set after a pipeline
4763 * is selected."
4764 */
4765 anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
4766 scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
4767 : GLK_BARRIER_MODE_3D_HULL;
4768 scec1.GLKBarrierModeMask = 1;
4769 }
4770 }
4771 #endif
4772
4773 #if GFX_VER == 9
4774 /* Undocumented workaround, we need to reemit MEDIA_CURBE_LOAD on Gfx9 when
4775 * switching from 3D->GPGPU, otherwise the shader gets corrupted push
4776 * constants. Note that this doesn't trigger a push constant reallocation,
4777 * we just reprogram the same pointer.
4778 *
4779 * The issue reproduces pretty much 100% on
4780 * dEQP-VK.memory_model.transitive.* tests. Reducing the number of
4781     * iterations in the test from 50 to < 10 makes the tests flaky.
4782 */
4783 if (pipeline == GPGPU)
4784 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4785 #endif
4786 #endif /* else of if GFX_VER >= 20 */
4787 cmd_buffer->state.current_pipeline = pipeline;
4788 }
4789
4790 void
4791 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4792 {
4793 genX(flush_pipeline_select)(cmd_buffer, _3D);
4794 }
4795
4796 void
4797 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4798 {
4799 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4800 }
4801
4802 void
4803 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
4804 const struct isl_surf *surf)
4805 {
4806 #if INTEL_NEEDS_WA_1808121037
4807 const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
4808 surf->samples == 1;
4809
4810 switch (cmd_buffer->state.gfx.depth_reg_mode) {
4811 case ANV_DEPTH_REG_MODE_HW_DEFAULT:
4812 if (!is_d16_1x_msaa)
4813 return;
4814 break;
4815 case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
4816 if (is_d16_1x_msaa)
4817 return;
4818 break;
4819 case ANV_DEPTH_REG_MODE_UNKNOWN:
4820 break;
4821 }
4822
4823 /* We'll change some CHICKEN registers depending on the depth surface
4824 * format. Do a depth flush and stall so the pipeline is not using these
4825 * settings while we change the registers.
4826 */
4827 anv_add_pending_pipe_bits(cmd_buffer,
4828 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4829 ANV_PIPE_DEPTH_STALL_BIT |
4830 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
4831 "Workaround: Stop pipeline for 1808121037");
4832 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4833
4834 /* Wa_1808121037
4835 *
4836 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
4837 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
4838 */
4839 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
4840 reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
4841 reg.HIZPlaneOptimizationdisablebitMask = true;
4842 }
4843
4844 cmd_buffer->state.gfx.depth_reg_mode =
4845 is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
4846 ANV_DEPTH_REG_MODE_HW_DEFAULT;
4847 #endif
4848 }
4849
4850 #if GFX_VER == 9
4851 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4852 *
4853 * "The VF cache needs to be invalidated before binding and then using
4854 * Vertex Buffers that overlap with any previously bound Vertex Buffer
4855 * (at a 64B granularity) since the last invalidation. A VF cache
4856 * invalidate is performed by setting the "VF Cache Invalidation Enable"
4857 * bit in PIPE_CONTROL."
4858 *
4859 * This is implemented by carefully tracking all vertex and index buffer
4860 * bindings and flushing if the cache ever ends up with a range in the cache
4861 * that would exceed 4 GiB. This is implemented in three parts:
4862 *
4863 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4864 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4865 * tracking code of the new binding. If this new binding would cause
4866 * the cache to have a too-large range on the next draw call, a pipeline
4867 * stall and VF cache invalidate are added to pending_pipeline_bits.
4868 *
4869 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4870 * empty whenever we emit a VF invalidate.
4871 *
4872 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4873 * after every 3DPRIMITIVE and copies the bound range into the dirty
4874 * range for each used buffer. This has to be a separate step because
4875 * we don't always re-bind all buffers and so 1. can't know which
4876 * buffers are actually bound.
4877 */
4878 void
4879 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4880 int vb_index,
4881 struct anv_address vb_address,
4882 uint32_t vb_size)
4883 {
4884 if (GFX_VER > 9)
4885 return;
4886
4887 struct anv_vb_cache_range *bound, *dirty;
4888 if (vb_index == -1) {
4889 bound = &cmd_buffer->state.gfx.ib_bound_range;
4890 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4891 } else {
4892 assert(vb_index >= 0);
4893 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4894 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4895 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4896 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4897 }
4898
4899 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4900 vb_address,
4901 vb_size)) {
4902 anv_add_pending_pipe_bits(cmd_buffer,
4903 ANV_PIPE_CS_STALL_BIT |
4904 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4905 "vb > 32b range");
4906 }
4907 }
4908
4909 void
4910 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4911 uint32_t access_type,
4912 uint64_t vb_used)
4913 {
4914 if (access_type == RANDOM) {
4915 /* We have an index buffer */
4916 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4917 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4918
4919 anv_merge_vb_cache_range(dirty, bound);
4920 }
4921
4922 uint64_t mask = vb_used;
4923 while (mask) {
4924 int i = u_bit_scan64(&mask);
4925 assert(i >= 0);
4926 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4927 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4928
4929 struct anv_vb_cache_range *bound, *dirty;
4930 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4931 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4932
4933 anv_merge_vb_cache_range(dirty, bound);
4934 }
4935 }
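
/* A hedged sketch of how the three hooks above are expected to interleave
 * with packet emission (illustrative pseudocode, not an additional code
 * path):
 *
 *    // when binding vertex buffer N
 *    emit 3DSTATE_VERTEX_BUFFERS for slot N;
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, N,
 *                                                   vb_address, vb_size);
 *
 *    // at draw time
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);   // may invalidate VF
 *    emit 3DPRIMITIVE;
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
 *                                                        access_type,
 *                                                        vb_used_mask);
 *
 * The names access_type and vb_used_mask are placeholders for whatever the
 * draw path actually has in hand at that point.
 */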
4936 #endif /* GFX_VER == 9 */
4937
4938 /**
4939 * Update the pixel hashing modes that determine the balancing of PS threads
4940 * across subslices and slices.
4941 *
4942 * \param width Width bound of the rendering area (already scaled down if \p
4943 * scale is greater than 1).
4944 * \param height Height bound of the rendering area (already scaled down if \p
4945 * scale is greater than 1).
4946 * \param scale The number of framebuffer samples that could potentially be
4947 * affected by an individual channel of the PS thread. This is
4948 * typically one for single-sampled rendering, but for operations
4949 * like CCS resolves and fast clears a single PS invocation may
4950 * update a huge number of pixels, in which case a finer
4951 * balancing is desirable in order to maximally utilize the
4952 * bandwidth available. UINT_MAX can be used as shorthand for
4953 * "finest hashing mode available".
4954 */
4955 void
4956 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
4957 unsigned width, unsigned height,
4958 unsigned scale)
4959 {
4960 #if GFX_VER == 9
4961 const struct intel_device_info *devinfo = cmd_buffer->device->info;
4962 const unsigned slice_hashing[] = {
4963 /* Because all Gfx9 platforms with more than one slice require
4964 * three-way subslice hashing, a single "normal" 16x16 slice hashing
4965 * block is guaranteed to suffer from substantial imbalance, with one
4966 * subslice receiving twice as much work as the other two in the
4967 * slice.
4968 *
4969 * The performance impact of that would be particularly severe when
4970 * three-way hashing is also in use for slice balancing (which is the
4971 * case for all Gfx9 GT4 platforms), because one of the slices
4972 * receives one every three 16x16 blocks in either direction, which
4973 * is roughly the periodicity of the underlying subslice imbalance
4974 * pattern ("roughly" because in reality the hardware's
4975 * implementation of three-way hashing doesn't do exact modulo 3
4976 * arithmetic, which somewhat decreases the magnitude of this effect
4977 * in practice). This leads to a systematic subslice imbalance
4978 * within that slice regardless of the size of the primitive. The
4979 * 32x32 hashing mode guarantees that the subslice imbalance within a
4980 * single slice hashing block is minimal, largely eliminating this
4981 * effect.
4982 */
4983 _32x32,
4984 /* Finest slice hashing mode available. */
4985 NORMAL
4986 };
4987 const unsigned subslice_hashing[] = {
4988 /* 16x16 would provide a slight cache locality benefit especially
4989 * visible in the sampler L1 cache efficiency of low-bandwidth
4990 * non-LLC platforms, but it comes at the cost of greater subslice
4991 * imbalance for primitives of dimensions approximately intermediate
4992 * between 16x4 and 16x16.
4993 */
4994 _16x4,
4995 /* Finest subslice hashing mode available. */
4996 _8x4
4997 };
4998 /* Dimensions of the smallest hashing block of a given hashing mode. If
4999 * the rendering area is smaller than this there can't possibly be any
5000 * benefit from switching to this mode, so we optimize out the
5001 * transition.
5002 */
5003 const unsigned min_size[][2] = {
5004 { 16, 4 },
5005 { 8, 4 }
5006 };
5007 const unsigned idx = scale > 1;
5008
5009 if (cmd_buffer->state.current_hash_scale != scale &&
5010 (width > min_size[idx][0] || height > min_size[idx][1])) {
5011 anv_add_pending_pipe_bits(cmd_buffer,
5012 ANV_PIPE_CS_STALL_BIT |
5013 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
5014 "change pixel hash mode");
5015 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5016
5017 anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
5018 gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
5019 gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
5020 gt.SubsliceHashing = subslice_hashing[idx];
5021 gt.SubsliceHashingMask = -1;
5022 }
5023
5024 cmd_buffer->state.current_hash_scale = scale;
5025 }
5026 #endif
5027 }
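
/* A hedged usage sketch: callers are expected to pass the (already scaled)
 * rendering area and a scale of 1 for regular draws, and a much larger value
 * (UINT_MAX as shorthand for "finest hashing available") for operations where
 * a single PS invocation touches many samples, such as CCS resolves or fast
 * clears:
 *
 *    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, fb_width, fb_height, 1);
 *    ...
 *    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, 64, 64, UINT_MAX);
 *
 * fb_width/fb_height here stand in for whatever bounds the caller has.
 */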
5028
5029 static void
5030 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
5031 {
5032 struct anv_device *device = cmd_buffer->device;
5033 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5034
5035 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
5036 device->isl_dev.ds.size / 4);
5037 if (dw == NULL)
5038 return;
5039
5040 struct isl_view isl_view = {};
5041 struct isl_depth_stencil_hiz_emit_info info = {
5042 .view = &isl_view,
5043 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
5044 };
5045
5046 if (gfx->depth_att.iview != NULL) {
5047 isl_view = gfx->depth_att.iview->planes[0].isl;
5048 } else if (gfx->stencil_att.iview != NULL) {
5049 isl_view = gfx->stencil_att.iview->planes[0].isl;
5050 }
5051
5052 if (gfx->view_mask) {
5053 assert(isl_view.array_len == 0 ||
5054 isl_view.array_len >= util_last_bit(gfx->view_mask));
5055 isl_view.array_len = util_last_bit(gfx->view_mask);
5056 } else {
5057 assert(isl_view.array_len == 0 ||
5058 isl_view.array_len >= util_last_bit(gfx->layer_count));
5059 isl_view.array_len = gfx->layer_count;
5060 }
5061
5062 if (gfx->depth_att.iview != NULL) {
5063 const struct anv_image_view *iview = gfx->depth_att.iview;
5064 const struct anv_image *image = iview->image;
5065
5066 const uint32_t depth_plane =
5067 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
5068 const struct anv_surface *depth_surface =
5069 &image->planes[depth_plane].primary_surface;
5070 const struct anv_address depth_address =
5071 anv_image_address(image, &depth_surface->memory_range);
5072
5073 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
5074
5075 info.depth_surf = &depth_surface->isl;
5076 info.depth_address = anv_address_physical(depth_address);
5077 info.mocs =
5078 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
5079
5080 info.hiz_usage = gfx->depth_att.aux_usage;
5081 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
5082 assert(isl_aux_usage_has_hiz(info.hiz_usage));
5083
5084 const struct anv_surface *hiz_surface =
5085 &image->planes[depth_plane].aux_surface;
5086 const struct anv_address hiz_address =
5087 anv_image_address(image, &hiz_surface->memory_range);
5088
5089 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
5090
5091 info.hiz_surf = &hiz_surface->isl;
5092 info.hiz_address = anv_address_physical(hiz_address);
5093
5094 info.depth_clear_value = anv_image_hiz_clear_value(image).f32[0];
5095 }
5096 }
5097
5098 if (gfx->stencil_att.iview != NULL) {
5099 const struct anv_image_view *iview = gfx->stencil_att.iview;
5100 const struct anv_image *image = iview->image;
5101
5102 const uint32_t stencil_plane =
5103 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5104 const struct anv_surface *stencil_surface =
5105 &image->planes[stencil_plane].primary_surface;
5106 const struct anv_address stencil_address =
5107 anv_image_address(image, &stencil_surface->memory_range);
5108
5109 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
5110
5111 info.stencil_surf = &stencil_surface->isl;
5112
5113 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
5114 info.stencil_address = anv_address_physical(stencil_address);
5115 info.mocs =
5116 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
5117 }
5118
5119 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5120
5121 if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
5122 intel_needs_workaround(cmd_buffer->device->info, 14014097488) ||
5123 intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
5124 /* Wa_1408224581
5125 *
5126       * Workaround (Gfx12LP A-step only): an additional pipe control with
5127       * post-sync = store dword operation is required (i.e. emit an
5128       * additional pipe control after the stencil state whenever the
5129       * surface state bits of this state change).
5130 *
5131 * This also seems sufficient to handle Wa_14014097488 and
5132 * Wa_14016712196.
5133 */
5134 genx_batch_emit_pipe_control_write(&cmd_buffer->batch, device->info,
5135 cmd_buffer->state.current_pipeline,
5136 WriteImmediateData,
5137 device->workaround_address, 0, 0);
5138 }
5139
5140 if (info.depth_surf)
5141 genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
5142
5143 cmd_buffer->state.gfx.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5144 }
5145
5146 static void
5147 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
5148 const struct anv_image_view *fsr_iview)
5149 {
5150 #if GFX_VERx10 >= 125
5151 struct anv_device *device = cmd_buffer->device;
5152
5153 if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
5154 return;
5155
5156 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
5157 device->isl_dev.cpb.size / 4);
5158 if (dw == NULL)
5159 return;
5160
5161 struct isl_cpb_emit_info info = { };
5162
5163 if (fsr_iview) {
5164 const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
5165
5166 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
5167
5168 struct anv_address addr =
5169 anv_address_add(binding->address, binding->memory_range.offset);
5170
5171 info.view = &fsr_iview->planes[0].isl;
5172 info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
5173 info.address = anv_address_physical(addr);
5174 info.mocs =
5175 anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
5176 ISL_SURF_USAGE_CPB_BIT);
5177 }
5178
5179 isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
5180
5181 /* Wa_14016712196:
5182 * Emit dummy pipe control after state that sends implicit depth flush.
5183 */
5184 if (intel_needs_workaround(device->info, 14016712196)) {
5185 genx_batch_emit_pipe_control_write(&cmd_buffer->batch, device->info,
5186 cmd_buffer->state.current_pipeline,
5187 WriteImmediateData,
5188 device->workaround_address, 0, 0);
5189 }
5190
5191 #endif /* GFX_VERx10 >= 125 */
5192 }
5193
5194 static VkImageLayout
5195 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5196 {
5197 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5198 vk_find_struct_const(att->pNext,
5199 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5200 if (layout_info != NULL)
5201 return layout_info->initialLayout;
5202
5203 return att->imageLayout;
5204 }
5205
5206 void genX(CmdBeginRendering)(
5207 VkCommandBuffer commandBuffer,
5208 const VkRenderingInfo* pRenderingInfo)
5209 {
5210 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5211 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5212 VkResult result;
5213
5214 if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
5215 assert(!"Trying to start a render pass on non-render queue!");
5216 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5217 return;
5218 }
5219
5220 anv_measure_beginrenderpass(cmd_buffer);
5221 trace_intel_begin_render_pass(&cmd_buffer->trace);
5222
5223 gfx->rendering_flags = pRenderingInfo->flags;
5224 gfx->view_mask = pRenderingInfo->viewMask;
5225 gfx->layer_count = pRenderingInfo->layerCount;
5226 gfx->samples = 0;
5227
5228 if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
5229 gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
5230 gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
5231 gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
5232 gfx->render_area = pRenderingInfo->renderArea;
5233 gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
5234 }
5235
5236 const bool is_multiview = gfx->view_mask != 0;
5237 const VkRect2D render_area = gfx->render_area;
5238 const uint32_t layers =
5239 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5240
5241 /* The framebuffer size is at least large enough to contain the render
5242 * area. Because a zero renderArea is possible, we MAX with 1.
5243 */
5244 struct isl_extent3d fb_size = {
5245 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5246 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5247 .d = layers,
5248 };
5249
5250 const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5251
5252 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5253 if (result != VK_SUCCESS)
5254 return;
5255
5256 genX(flush_pipeline_select_3d)(cmd_buffer);
5257
5258 UNUSED bool render_target_change = false;
5259 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5260 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE) {
5261 render_target_change |= gfx->color_att[i].iview != NULL;
5262
5263 gfx->color_att[i].vk_format = VK_FORMAT_UNDEFINED;
5264 gfx->color_att[i].iview = NULL;
5265 gfx->color_att[i].layout = VK_IMAGE_LAYOUT_UNDEFINED;
5266 gfx->color_att[i].aux_usage = ISL_AUX_USAGE_NONE;
5267 continue;
5268 }
5269
5270 const VkRenderingAttachmentInfo *att =
5271 &pRenderingInfo->pColorAttachments[i];
5272 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5273 const VkImageLayout initial_layout = attachment_initial_layout(att);
5274
5275 assert(render_area.offset.x + render_area.extent.width <=
5276 iview->vk.extent.width);
5277 assert(render_area.offset.y + render_area.extent.height <=
5278 iview->vk.extent.height);
5279 assert(layers <= iview->vk.layer_count);
5280
5281 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5282 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5283
5284 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5285 gfx->samples |= iview->vk.image->samples;
5286
5287 enum isl_aux_usage aux_usage =
5288 anv_layout_to_aux_usage(cmd_buffer->device->info,
5289 iview->image,
5290 VK_IMAGE_ASPECT_COLOR_BIT,
5291 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5292 att->imageLayout,
5293 cmd_buffer->queue_family->queueFlags);
5294
5295 render_target_change |= gfx->color_att[i].iview != iview;
5296
5297 gfx->color_att[i].vk_format = iview->vk.format;
5298 gfx->color_att[i].iview = iview;
5299 gfx->color_att[i].layout = att->imageLayout;
5300 gfx->color_att[i].aux_usage = aux_usage;
5301
5302 union isl_color_value fast_clear_color = { .u32 = { 0, } };
5303
5304 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5305 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5306 uint32_t clear_view_mask = pRenderingInfo->viewMask;
5307 VkClearRect clear_rect = {
5308 .rect = render_area,
5309 .baseArrayLayer = iview->vk.base_array_layer,
5310 .layerCount = layers,
5311 };
5312 const union isl_color_value clear_color =
5313 vk_to_isl_color_with_format(att->clearValue.color,
5314 iview->planes[0].isl.format);
5315
5316 /* We only support fast-clears on the first layer */
5317 const bool fast_clear =
5318 (!is_multiview || (gfx->view_mask & 1)) &&
5319 anv_can_fast_clear_color(cmd_buffer, iview->image,
5320 iview->vk.base_mip_level,
5321 &clear_rect, att->imageLayout,
5322 iview->planes[0].isl.format,
5323 clear_color);
5324
5325 if (att->imageLayout != initial_layout) {
5326 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5327 render_area.extent.width == iview->vk.extent.width &&
5328 render_area.extent.height == iview->vk.extent.height);
5329 if (is_multiview) {
5330 u_foreach_bit(view, gfx->view_mask) {
5331 transition_color_buffer(cmd_buffer, iview->image,
5332 VK_IMAGE_ASPECT_COLOR_BIT,
5333 iview->vk.base_mip_level, 1,
5334 iview->vk.base_array_layer + view,
5335 1, /* layer_count */
5336 initial_layout, att->imageLayout,
5337 VK_QUEUE_FAMILY_IGNORED,
5338 VK_QUEUE_FAMILY_IGNORED,
5339 fast_clear);
5340 }
5341 } else {
5342 transition_color_buffer(cmd_buffer, iview->image,
5343 VK_IMAGE_ASPECT_COLOR_BIT,
5344 iview->vk.base_mip_level, 1,
5345 iview->vk.base_array_layer,
5346 gfx->layer_count,
5347 initial_layout, att->imageLayout,
5348 VK_QUEUE_FAMILY_IGNORED,
5349 VK_QUEUE_FAMILY_IGNORED,
5350 fast_clear);
5351 }
5352 }
5353
5354 if (fast_clear) {
5355 /* We only support fast-clears on the first layer */
5356 assert(iview->vk.base_mip_level == 0 &&
5357 iview->vk.base_array_layer == 0);
5358
5359 fast_clear_color = clear_color;
5360
5361 if (iview->image->vk.samples == 1) {
5362 anv_image_ccs_op(cmd_buffer, iview->image,
5363 iview->planes[0].isl.format,
5364 iview->planes[0].isl.swizzle,
5365 VK_IMAGE_ASPECT_COLOR_BIT,
5366 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5367 &fast_clear_color,
5368 false);
5369 } else {
5370 anv_image_mcs_op(cmd_buffer, iview->image,
5371 iview->planes[0].isl.format,
5372 iview->planes[0].isl.swizzle,
5373 VK_IMAGE_ASPECT_COLOR_BIT,
5374 0, 1, ISL_AUX_OP_FAST_CLEAR,
5375 &fast_clear_color,
5376 false);
5377 }
5378 clear_view_mask &= ~1u;
5379 clear_rect.baseArrayLayer++;
5380 clear_rect.layerCount--;
5381 #if GFX_VER < 20
5382 genX(set_fast_clear_state)(cmd_buffer, iview->image,
5383 iview->planes[0].isl.format,
5384 iview->planes[0].isl.swizzle,
5385 clear_color);
5386 #endif
5387 }
5388
5389 if (is_multiview) {
5390 u_foreach_bit(view, clear_view_mask) {
5391 anv_image_clear_color(cmd_buffer, iview->image,
5392 VK_IMAGE_ASPECT_COLOR_BIT,
5393 aux_usage,
5394 iview->planes[0].isl.format,
5395 iview->planes[0].isl.swizzle,
5396 iview->vk.base_mip_level,
5397 iview->vk.base_array_layer + view, 1,
5398 render_area, clear_color);
5399 }
5400 } else if (clear_rect.layerCount > 0) {
5401 anv_image_clear_color(cmd_buffer, iview->image,
5402 VK_IMAGE_ASPECT_COLOR_BIT,
5403 aux_usage,
5404 iview->planes[0].isl.format,
5405 iview->planes[0].isl.swizzle,
5406 iview->vk.base_mip_level,
5407 clear_rect.baseArrayLayer,
5408 clear_rect.layerCount,
5409 render_area, clear_color);
5410 }
5411 } else {
5412 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5413 assert(att->imageLayout == initial_layout);
5414 }
5415
5416 struct isl_view isl_view = iview->planes[0].isl;
5417 if (pRenderingInfo->viewMask) {
5418 assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5419 isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5420 } else {
5421 assert(isl_view.array_len >= pRenderingInfo->layerCount);
5422 isl_view.array_len = pRenderingInfo->layerCount;
5423 }
5424
5425 anv_image_fill_surface_state(cmd_buffer->device,
5426 iview->image,
5427 VK_IMAGE_ASPECT_COLOR_BIT,
5428 &isl_view,
5429 ISL_SURF_USAGE_RENDER_TARGET_BIT,
5430 aux_usage, &fast_clear_color,
5431 0, /* anv_image_view_state_flags */
5432 &gfx->color_att[i].surface_state);
5433
5434 add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
5435
5436 if (GFX_VER < 10 &&
5437 (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5438 render_area.extent.width != iview->vk.extent.width ||
5439 render_area.extent.height != iview->vk.extent.height ||
5440 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5441 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5442 iview->planes[0].isl.base_level == 0 &&
5443 iview->planes[0].isl.base_array_layer == 0) {
5444 struct anv_state surf_state = gfx->color_att[i].surface_state.state;
5445 genX(cmd_buffer_load_clear_color)(cmd_buffer, surf_state, iview);
5446 }
5447
5448 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5449 gfx->color_att[i].resolve_mode = att->resolveMode;
5450 gfx->color_att[i].resolve_iview =
5451 anv_image_view_from_handle(att->resolveImageView);
5452 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5453 }
5454 }
5455
5456 anv_cmd_graphic_state_update_has_uint_rt(gfx);
5457
5458 const struct anv_image_view *fsr_iview = NULL;
5459 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
5460 vk_find_struct_const(pRenderingInfo->pNext,
5461 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
5462 if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
5463 fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
5464 /* imageLayout and shadingRateAttachmentTexelSize are ignored */
5465 }
5466
5467 const struct anv_image_view *ds_iview = NULL;
5468 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5469 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5470 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5471 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5472 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5473 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5474 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5475 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5476 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5477 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5478 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5479 VkClearDepthStencilValue clear_value = {};
5480
5481 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5482 d_iview = anv_image_view_from_handle(d_att->imageView);
5483 initial_depth_layout = attachment_initial_layout(d_att);
5484 depth_layout = d_att->imageLayout;
5485 depth_aux_usage =
5486 anv_layout_to_aux_usage(cmd_buffer->device->info,
5487 d_iview->image,
5488 VK_IMAGE_ASPECT_DEPTH_BIT,
5489 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5490 depth_layout,
5491 cmd_buffer->queue_family->queueFlags);
5492 clear_value.depth = d_att->clearValue.depthStencil.depth;
5493 }
5494
5495 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5496 s_iview = anv_image_view_from_handle(s_att->imageView);
5497 initial_stencil_layout = attachment_initial_layout(s_att);
5498 stencil_layout = s_att->imageLayout;
5499 stencil_aux_usage =
5500 anv_layout_to_aux_usage(cmd_buffer->device->info,
5501 s_iview->image,
5502 VK_IMAGE_ASPECT_STENCIL_BIT,
5503 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5504 stencil_layout,
5505 cmd_buffer->queue_family->queueFlags);
5506 clear_value.stencil = s_att->clearValue.depthStencil.stencil;
5507 }
5508
5509 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5510 ds_iview = d_iview != NULL ? d_iview : s_iview;
5511 assert(ds_iview != NULL);
5512
5513 assert(render_area.offset.x + render_area.extent.width <=
5514 ds_iview->vk.extent.width);
5515 assert(render_area.offset.y + render_area.extent.height <=
5516 ds_iview->vk.extent.height);
5517 assert(layers <= ds_iview->vk.layer_count);
5518
5519 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5520 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5521
5522 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5523 gfx->samples |= ds_iview->vk.image->samples;
5524
5525 VkImageAspectFlags clear_aspects = 0;
5526 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5527 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5528 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5529 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5530 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5531 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5532
5533 if (clear_aspects != 0) {
5534 const bool hiz_clear =
5535 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5536 depth_layout, clear_aspects,
5537 clear_value.depth,
5538 render_area,
5539 cmd_buffer->queue_family->queueFlags);
5540
5541 if (depth_layout != initial_depth_layout) {
5542 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5543 render_area.extent.width == d_iview->vk.extent.width &&
5544 render_area.extent.height == d_iview->vk.extent.height);
5545
5546 if (is_multiview) {
5547 u_foreach_bit(view, gfx->view_mask) {
5548 transition_depth_buffer(cmd_buffer, d_iview->image,
5549 d_iview->vk.base_mip_level, 1,
5550 d_iview->vk.base_array_layer + view,
5551 1 /* layer_count */,
5552 initial_depth_layout, depth_layout,
5553 hiz_clear);
5554 }
5555 } else {
5556 transition_depth_buffer(cmd_buffer, d_iview->image,
5557 d_iview->vk.base_mip_level, 1,
5558 d_iview->vk.base_array_layer,
5559 gfx->layer_count,
5560 initial_depth_layout, depth_layout,
5561 hiz_clear);
5562 }
5563 }
5564
5565 if (stencil_layout != initial_stencil_layout) {
5566 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5567 render_area.extent.width == s_iview->vk.extent.width &&
5568 render_area.extent.height == s_iview->vk.extent.height);
5569
5570 if (is_multiview) {
5571 u_foreach_bit(view, gfx->view_mask) {
5572 transition_stencil_buffer(cmd_buffer, s_iview->image,
5573 s_iview->vk.base_mip_level, 1,
5574 s_iview->vk.base_array_layer + view,
5575 1 /* layer_count */,
5576 initial_stencil_layout,
5577 stencil_layout,
5578 hiz_clear);
5579 }
5580 } else {
5581 transition_stencil_buffer(cmd_buffer, s_iview->image,
5582 s_iview->vk.base_mip_level, 1,
5583 s_iview->vk.base_array_layer,
5584 gfx->layer_count,
5585 initial_stencil_layout,
5586 stencil_layout,
5587 hiz_clear);
5588 }
5589 }
5590
5591 if (is_multiview) {
5592 u_foreach_bit(view, gfx->view_mask) {
5593 uint32_t level = ds_iview->vk.base_mip_level;
5594 uint32_t layer = ds_iview->vk.base_array_layer + view;
5595
5596 if (hiz_clear) {
5597 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5598 clear_aspects,
5599 level, layer, 1,
5600 render_area, &clear_value);
5601 } else {
5602 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5603 clear_aspects,
5604 depth_aux_usage,
5605 level, layer, 1,
5606 render_area, &clear_value);
5607 }
5608 }
5609 } else {
5610 uint32_t level = ds_iview->vk.base_mip_level;
5611 uint32_t base_layer = ds_iview->vk.base_array_layer;
5612 uint32_t layer_count = gfx->layer_count;
5613
5614 if (hiz_clear) {
5615 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5616 clear_aspects,
5617 level, base_layer, layer_count,
5618 render_area, &clear_value);
5619 } else {
5620 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5621 clear_aspects,
5622 depth_aux_usage,
5623 level, base_layer, layer_count,
5624 render_area, &clear_value);
5625 }
5626 }
5627 } else {
5628 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5629 assert(depth_layout == initial_depth_layout);
5630 assert(stencil_layout == initial_stencil_layout);
5631 }
5632
5633 if (d_iview != NULL) {
5634 gfx->depth_att.vk_format = d_iview->vk.format;
5635 gfx->depth_att.iview = d_iview;
5636 gfx->depth_att.layout = depth_layout;
5637 gfx->depth_att.aux_usage = depth_aux_usage;
5638 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5639 assert(d_att->resolveImageView != VK_NULL_HANDLE);
5640 gfx->depth_att.resolve_mode = d_att->resolveMode;
5641 gfx->depth_att.resolve_iview =
5642 anv_image_view_from_handle(d_att->resolveImageView);
5643 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5644 }
5645 }
5646
5647 if (s_iview != NULL) {
5648 gfx->stencil_att.vk_format = s_iview->vk.format;
5649 gfx->stencil_att.iview = s_iview;
5650 gfx->stencil_att.layout = stencil_layout;
5651 gfx->stencil_att.aux_usage = stencil_aux_usage;
5652 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5653 assert(s_att->resolveImageView != VK_NULL_HANDLE);
5654 gfx->stencil_att.resolve_mode = s_att->resolveMode;
5655 gfx->stencil_att.resolve_iview =
5656 anv_image_view_from_handle(s_att->resolveImageView);
5657 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5658 }
5659 }
5660 }
5661
5662 /* Finally, now that we know the right size, set up the null surface */
5663 assert(util_bitcount(gfx->samples) <= 1);
5664 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5665 gfx->null_surface_state.map,
5666 .size = fb_size);
5667
5668 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5669 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5670 continue;
5671
5672 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5673 gfx->color_att[i].surface_state.state.map,
5674 .size = fb_size);
5675 }
5676
5677 /****** We can now start emitting code to begin the render pass ******/
5678
5679 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5680
5681 /* It is possible to start a render pass with an old pipeline. Because the
5682 * render pass and subpass index are both baked into the pipeline, this is
5683 * highly unlikely. In order to do so, it requires that you have a render
5684 * pass with a single subpass and that you use that render pass twice
5685 * back-to-back and use the same pipeline at the start of the second render
5686 * pass as at the end of the first. In order to avoid unpredictable issues
5687 * with this edge case, we just dirty the pipeline at the start of every
5688 * subpass.
5689 */
5690 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5691
5692 #if GFX_VER >= 11
5693 if (render_target_change) {
5694 /* The PIPE_CONTROL command description says:
5695 *
5696 * "Whenever a Binding Table Index (BTI) used by a Render Target Message
5697 * points to a different RENDER_SURFACE_STATE, SW must issue a Render
5698 * Target Cache Flush by enabling this bit. When render target flush
5699 * is set due to new association of BTI, PS Scoreboard Stall bit must
5700 * be set in this packet."
5701 *
5702 * We assume that a new BeginRendering is always changing the RTs, which
5703 * may not be true and cause excessive flushing. We can trivially skip it
5704 * in the case that there are no RTs (depth-only rendering), though.
5705 */
5706 anv_add_pending_pipe_bits(cmd_buffer,
5707 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
5708 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
5709 "change RT");
5710 }
5711 #endif
5712
5713 cmd_buffer_emit_depth_stencil(cmd_buffer);
5714
5715 cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
5716 }
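
/* For reference, a hedged sketch of the kind of application call that drives
 * the path above (application-side code, with color_view/fb_width/fb_height
 * assumed to exist on the application side; not driver code):
 *
 *    const VkRenderingAttachmentInfo color_att = {
 *       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
 *       .imageView = color_view,
 *       .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
 *       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
 *       .clearValue = { .color = { .float32 = { 0, 0, 0, 1 } } },
 *    };
 *    const VkRenderingInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
 *       .renderArea = { .extent = { fb_width, fb_height } },
 *       .layerCount = 1,
 *       .colorAttachmentCount = 1,
 *       .pColorAttachments = &color_att,
 *    };
 *    vkCmdBeginRendering(cmd, &info);
 *
 * A LOAD_OP_CLEAR attachment like this is what exercises the fast-clear vs.
 * slow-clear decision in genX(CmdBeginRendering) above.
 */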
5717
5718 static void
5719 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5720 struct anv_attachment *att,
5721 VkImageAspectFlagBits aspect)
5722 {
5723 #if GFX_VER < 20
5724 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5725 const struct anv_image_view *iview = att->iview;
5726
5727 if (iview == NULL)
5728 return;
5729
5730 if (gfx->view_mask == 0) {
5731 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5732 aspect, att->aux_usage,
5733 iview->planes[0].isl.base_level,
5734 iview->planes[0].isl.base_array_layer,
5735 gfx->layer_count);
5736 } else {
5737 uint32_t res_view_mask = gfx->view_mask;
5738 while (res_view_mask) {
5739 int i = u_bit_scan(&res_view_mask);
5740
5741 const uint32_t level = iview->planes[0].isl.base_level;
5742 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5743
5744 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5745 aspect, att->aux_usage,
5746 level, layer, 1);
5747 }
5748 }
5749 #endif
5750 }
5751
5752 void genX(CmdEndRendering)(
5753 VkCommandBuffer commandBuffer)
5754 {
5755 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5756 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5757
5758 if (anv_batch_has_error(&cmd_buffer->batch))
5759 return;
5760
5761 const bool is_multiview = gfx->view_mask != 0;
5762 const uint32_t layers =
5763 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5764
5765 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5766 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5767 VK_IMAGE_ASPECT_COLOR_BIT);
5768 }
5769
5770 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5771 VK_IMAGE_ASPECT_DEPTH_BIT);
5772
5773 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5774 VK_IMAGE_ASPECT_STENCIL_BIT);
5775
5776
5777 if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5778 bool has_color_resolve = false;
5779 UNUSED bool has_sparse_color_resolve = false;
5780
5781 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5782 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE) {
5783 has_color_resolve = true;
5784 has_sparse_color_resolve |=
5785 anv_image_is_sparse(gfx->color_att[i].iview->image);
5786 }
5787 }
5788
5789 if (has_color_resolve) {
5790 /* We are about to do some MSAA resolves. We need to flush so that
5791           * the results of writes to the MSAA color attachments show up in the
5792 * sampler when we blit to the single-sampled resolve target.
5793 */
5794 anv_add_pending_pipe_bits(cmd_buffer,
5795 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5796 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5797 "MSAA resolve");
5798 }
5799
5800 const bool has_depth_resolve =
5801 gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE;
5802 const bool has_stencil_resolve =
5803 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE;
5804
5805 if (has_depth_resolve || has_stencil_resolve) {
5806 /* We are about to do some MSAA resolves. We need to flush so that
5807           * the results of writes to the MSAA depth attachments show up in the
5808 * sampler when we blit to the single-sampled resolve target.
5809 */
5810 anv_add_pending_pipe_bits(cmd_buffer,
5811 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5812 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5813 "MSAA resolve");
5814 }
5815
5816 #if GFX_VER < 20
5817 const bool has_sparse_depth_resolve =
5818 has_depth_resolve &&
5819 anv_image_is_sparse(gfx->depth_att.iview->image);
5820 const bool has_sparse_stencil_resolve =
5821 has_stencil_resolve &&
5822 anv_image_is_sparse(gfx->stencil_att.iview->image);
5823 /* Our HW implementation of the sparse feature prior to Xe2 lives in the
5824 * GAM unit (interface between all the GPU caches and external memory).
5825 * As a result writes to NULL bound images & buffers that should be
5826       * ignored are actually still visible in the caches. The only way for us
5827       * to get NULL bound regions to correctly return 0s is to evict the caches
5828       * so that they are repopulated with 0s.
5829       *
5830       * Our understanding is that Xe2 started to tag the L3 cache with some
5831       * kind of physical address information instead. It is therefore able to
5832 * detect that a cache line in the cache is going to a null tile and so
5833 * the L3 cache also has a sparse compatible behavior and we don't need
5834 * to flush anymore.
5835 */
5836 if (has_sparse_color_resolve || has_sparse_depth_resolve ||
5837 has_sparse_stencil_resolve) {
5838 /* If the resolve image is sparse we need some extra bits to make
5839 * sure unbound regions read 0, as residencyNonResidentStrict
5840 * mandates.
5841 */
5842 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT,
5843 "sparse MSAA resolve");
5844 }
5845 #endif
5846
5847 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5848 const struct anv_attachment *att = &gfx->color_att[i];
5849 if (att->resolve_mode == VK_RESOLVE_MODE_NONE)
5850 continue;
5851
5852 anv_attachment_msaa_resolve(cmd_buffer, att, att->layout,
5853 VK_IMAGE_ASPECT_COLOR_BIT);
5854 }
5855
5856 if (has_depth_resolve) {
5857 const struct anv_image_view *src_iview = gfx->depth_att.iview;
5858
5859 /* MSAA resolves sample from the source attachment. Transition the
5860 * depth attachment first to get rid of any HiZ that we may not be
5861 * able to handle.
5862 */
5863 transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
5864 src_iview->planes[0].isl.base_array_layer,
5865 layers,
5866 gfx->depth_att.layout,
5867 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5868 false /* will_full_fast_clear */);
5869
5870 anv_attachment_msaa_resolve(cmd_buffer, &gfx->depth_att,
5871 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5872 VK_IMAGE_ASPECT_DEPTH_BIT);
5873
5874 /* Transition the source back to the original layout. This seems a
5875 * bit inefficient but, since HiZ resolves aren't destructive, going
5876 * from less HiZ to more is generally a no-op.
5877 */
5878 transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
5879 src_iview->planes[0].isl.base_array_layer,
5880 layers,
5881 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5882 gfx->depth_att.layout,
5883 false /* will_full_fast_clear */);
5884 }
5885
5886 if (has_stencil_resolve) {
5887 anv_attachment_msaa_resolve(cmd_buffer, &gfx->stencil_att,
5888 gfx->stencil_att.layout,
5889 VK_IMAGE_ASPECT_STENCIL_BIT);
5890 }
5891 }
5892
5893 trace_intel_end_render_pass(&cmd_buffer->trace,
5894 gfx->render_area.extent.width,
5895 gfx->render_area.extent.height,
5896 gfx->color_att_count,
5897 gfx->samples);
5898
5899 anv_cmd_buffer_reset_rendering(cmd_buffer);
5900 }
5901
5902 void
5903 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5904 {
5905 struct mi_builder b;
5906 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5907
5908 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5909 mi_reg32(ANV_PREDICATE_RESULT_REG));
5910 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5911
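/* Descriptive sketch of the sequence below (derived from the code, not an
 * authoritative statement): SRC0 now holds the precomputed predicate result
 * and SRC1 holds 0, so LOAD_LOADINV of the SRCS_EQUAL comparison loads
 * "result != 0" into the MI predicate for subsequent predicated commands to
 * consume.
 */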
5912 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5913 mip.LoadOperation = LOAD_LOADINV;
5914 mip.CombineOperation = COMBINE_SET;
5915 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5916 }
5917 }
5918
5919 void genX(CmdBeginConditionalRenderingEXT)(
5920 VkCommandBuffer commandBuffer,
5921 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5922 {
5923 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5924 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5925 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5926 struct anv_address value_address =
5927 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5928
5929 const bool isInverted = pConditionalRenderingBegin->flags &
5930 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5931
5932 cmd_state->conditional_render_enabled = true;
5933
5934 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5935
5936 struct mi_builder b;
5937 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5938 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
5939 mi_builder_set_mocs(&b, mocs);
5940
5941 /* Section 19.4 of the Vulkan 1.1.85 spec says:
5942 *
5943 * If the value of the predicate in buffer memory changes
5944 * while conditional rendering is active, the rendering commands
5945 * may be discarded in an implementation-dependent way.
5946 * Some implementations may latch the value of the predicate
5947 * upon beginning conditional rendering while others
5948 * may read it before every rendering command.
5949 *
5950 * So it's perfectly fine to read a value from the buffer once.
5951 */
5952 struct mi_value value = mi_mem32(value_address);
5953
5954 /* Precompute the predicate result. This is necessary to support secondary
5955 * command buffers, since it is unknown whether conditional rendering is
5956 * inverted at the time they are populated.
5957 */
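/* A note on the arithmetic below (derived from the code): mi_ult(0, value)
 * is non-zero exactly when the predicate value is non-zero (the normal
 * case), while mi_uge(0, value) is non-zero exactly when the value is zero
 * (the inverted case), matching the
 * VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT semantics.
 */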
5958 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5959 isInverted ? mi_uge(&b, mi_imm(0), value) :
5960 mi_ult(&b, mi_imm(0), value));
5961 }
5962
5963 void genX(CmdEndConditionalRenderingEXT)(
5964 VkCommandBuffer commandBuffer)
5965 {
5966 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5967 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5968
5969 cmd_state->conditional_render_enabled = false;
5970 }
5971
5972 /* Mask of stage bits that are pipelined, i.e. stages whose work gets
5973 * queued by the command streamer for later execution.
5974 */
5975 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5976 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5977 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5978 VK_PIPELINE_STAGE_2_HOST_BIT | \
5979 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
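/* Illustrative sketch (not from the original source): any stage outside the
 * small excluded set above intersects this mask, e.g.
 *
 *    VkPipelineStageFlags2 mask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT;
 *    bool needs_stall = (mask & ANV_PIPELINE_STAGE_PIPELINED_BITS) != 0;  // true
 *
 * while a mask of only VK_PIPELINE_STAGE_2_HOST_BIT would not, so
 * CmdSetEvent2/CmdResetEvent2 below can skip the CS stall for it.
 */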
5980
5981 void genX(CmdSetEvent2)(
5982 VkCommandBuffer commandBuffer,
5983 VkEvent _event,
5984 const VkDependencyInfo* pDependencyInfo)
5985 {
5986 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5987 ANV_FROM_HANDLE(anv_event, event, _event);
5988
5989 switch (cmd_buffer->batch.engine_class) {
5990 case INTEL_ENGINE_CLASS_VIDEO:
5991 case INTEL_ENGINE_CLASS_COPY:
5992 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5993 flush.PostSyncOperation = WriteImmediateData;
5994 flush.Address = anv_state_pool_state_address(
5995 &cmd_buffer->device->dynamic_state_pool,
5996 event->state);
5997 flush.ImmediateData = VK_EVENT_SET;
5998 }
5999 break;
6000
6001 case INTEL_ENGINE_CLASS_RENDER:
6002 case INTEL_ENGINE_CLASS_COMPUTE: {
6003 VkPipelineStageFlags2 src_stages = 0;
6004
6005 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
6006 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
6007 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
6008 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
6009 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
6010 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
6011
6012 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6013 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6014
6015 enum anv_pipe_bits pc_bits = 0;
6016 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
6017 pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
6018 pc_bits |= ANV_PIPE_CS_STALL_BIT;
6019 }
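/* Note (inferred from the mask above): stalls are only added for pipelined
 * source stages; for the non-pipelined stages there is no outstanding
 * pipeline work to wait on, so a plain post-sync write suffices.
 */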
6020
6021 genx_batch_emit_pipe_control_write
6022 (&cmd_buffer->batch, cmd_buffer->device->info,
6023 cmd_buffer->state.current_pipeline, WriteImmediateData,
6024 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
6025 event->state),
6026 VK_EVENT_SET, pc_bits);
6027 break;
6028 }
6029
6030 default:
6031 unreachable("Invalid engine class");
6032 }
6033 }
6034
6035 void genX(CmdResetEvent2)(
6036 VkCommandBuffer commandBuffer,
6037 VkEvent _event,
6038 VkPipelineStageFlags2 stageMask)
6039 {
6040 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6041 ANV_FROM_HANDLE(anv_event, event, _event);
6042
6043 switch (cmd_buffer->batch.engine_class) {
6044 case INTEL_ENGINE_CLASS_VIDEO:
6045 case INTEL_ENGINE_CLASS_COPY:
6046 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
6047 flush.PostSyncOperation = WriteImmediateData;
6048 flush.Address = anv_state_pool_state_address(
6049 &cmd_buffer->device->dynamic_state_pool,
6050 event->state);
6051 flush.ImmediateData = VK_EVENT_RESET;
6052 }
6053 break;
6054
6055 case INTEL_ENGINE_CLASS_RENDER:
6056 case INTEL_ENGINE_CLASS_COMPUTE: {
6057 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
6058 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6059
6060 enum anv_pipe_bits pc_bits = 0;
6061 if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
6062 pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
6063 pc_bits |= ANV_PIPE_CS_STALL_BIT;
6064 }
6065
6066 genx_batch_emit_pipe_control_write
6067 (&cmd_buffer->batch, cmd_buffer->device->info,
6068 cmd_buffer->state.current_pipeline, WriteImmediateData,
6069 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
6070 event->state),
6071 VK_EVENT_RESET,
6072 pc_bits);
6073 break;
6074 }
6075
6076 default:
6077 unreachable("Invalid engine class");
6078 }
6079 }
6080
6081 void genX(CmdWaitEvents2)(
6082 VkCommandBuffer commandBuffer,
6083 uint32_t eventCount,
6084 const VkEvent* pEvents,
6085 const VkDependencyInfo* pDependencyInfos)
6086 {
6087 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6088
6089 for (uint32_t i = 0; i < eventCount; i++) {
6090 ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
6091
6092 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6093 sem.WaitMode = PollingMode;
6094 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6095 sem.SemaphoreDataDword = VK_EVENT_SET;
6096 sem.SemaphoreAddress = anv_state_pool_state_address(
6097 &cmd_buffer->device->dynamic_state_pool,
6098 event->state);
6099 }
6100 }
6101
6102 cmd_buffer_barrier(cmd_buffer, eventCount, pDependencyInfos, "wait event");
6103 }
6104
6105 static uint32_t vk_to_intel_index_type(VkIndexType type)
6106 {
6107 switch (type) {
6108 case VK_INDEX_TYPE_UINT8_KHR:
6109 return INDEX_BYTE;
6110 case VK_INDEX_TYPE_UINT16:
6111 return INDEX_WORD;
6112 case VK_INDEX_TYPE_UINT32:
6113 return INDEX_DWORD;
6114 default:
6115 unreachable("invalid index type");
6116 }
6117 }
6118
6119 void genX(CmdBindIndexBuffer2KHR)(
6120 VkCommandBuffer commandBuffer,
6121 VkBuffer _buffer,
6122 VkDeviceSize offset,
6123 VkDeviceSize size,
6124 VkIndexType indexType)
6125 {
6126 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6127 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
6128
6129 uint32_t restart_index = vk_index_to_restart(indexType);
6130 if (cmd_buffer->state.gfx.restart_index != restart_index) {
6131 cmd_buffer->state.gfx.restart_index = restart_index;
6132 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
6133 }
6134
6135 uint32_t index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
6136 uint32_t index_type = vk_to_intel_index_type(indexType);
6137 if (cmd_buffer->state.gfx.index_buffer != buffer ||
6138 cmd_buffer->state.gfx.index_type != index_type ||
6139 cmd_buffer->state.gfx.index_offset != offset ||
6140 cmd_buffer->state.gfx.index_size != index_size) {
6141 cmd_buffer->state.gfx.index_buffer = buffer;
6142 cmd_buffer->state.gfx.index_type = index_type;
6143 cmd_buffer->state.gfx.index_offset = offset;
6144 cmd_buffer->state.gfx.index_size = index_size;
6145 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
6146 }
6147 }
6148
6149 VkResult genX(CmdSetPerformanceOverrideINTEL)(
6150 VkCommandBuffer commandBuffer,
6151 const VkPerformanceOverrideInfoINTEL* pOverrideInfo)
6152 {
6153 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6154
6155 switch (pOverrideInfo->type) {
6156 case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
6157 anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
6158 csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
6159 csdm2.MediaInstructionDisable = pOverrideInfo->enable;
6160 csdm2._3DRenderingInstructionDisableMask = true;
6161 csdm2.MediaInstructionDisableMask = true;
6162 }
6163 break;
6164 }
6165
6166 case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
6167 if (pOverrideInfo->enable) {
6168 /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
6169 anv_add_pending_pipe_bits(cmd_buffer,
6170 ANV_PIPE_BARRIER_FLUSH_BITS |
6171 ANV_PIPE_INVALIDATE_BITS,
6172 "perf counter isolation");
6173 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6174 }
6175 break;
6176
6177 default:
6178 unreachable("Invalid override");
6179 }
6180
6181 return VK_SUCCESS;
6182 }
6183
6184 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
6185 VkCommandBuffer commandBuffer,
6186 const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo)
6187 {
6188 /* TODO: Waiting on the register to write, might depend on generation. */
6189
6190 return VK_SUCCESS;
6191 }
6192
6193 #define TIMESTAMP 0x2358
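/* TIMESTAMP is the command streamer's free-running timestamp register;
 * ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE below copies it to memory with a plain
 * MI register store and no additional pipeline synchronization, unlike the
 * end-of-pipe and CS-stall variants.
 */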
6194
6195 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
6196 struct anv_device *device,
6197 struct anv_address addr,
6198 enum anv_timestamp_capture_type type,
6199 void *data) {
6200 /* Make sure the ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
6201 * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on
6202 * transfer queues.
6203 */
6204 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
6205 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
6206 assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
6207 type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
6208 }
6209
6210 switch (type) {
6211 case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
6212 struct mi_builder b;
6213 mi_builder_init(&b, device->info, batch);
6214 mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
6215 break;
6216 }
6217
6218 case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
6219 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
6220 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
6221 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6222 if (intel_needs_workaround(device->info, 16018063123))
6223 genX(batch_emit_fast_color_dummy_blit)(batch, device);
6224 anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
6225 fd.PostSyncOperation = WriteTimestamp;
6226 fd.Address = addr;
6227 }
6228 } else {
6229 genx_batch_emit_pipe_control_write(batch, device->info, 0,
6230 WriteTimestamp, addr, 0, 0);
6231 }
6232 break;
6233 }
6234
6235 case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
6236 genx_batch_emit_pipe_control_write
6237 (batch, device->info, 0, WriteTimestamp, addr, 0,
6238 ANV_PIPE_CS_STALL_BIT);
6239 break;
6240
6241 #if GFX_VERx10 >= 125
6242 case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
6243 uint32_t dwords[GENX(COMPUTE_WALKER_length)];
6244
6245 GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
6246 .body = {
6247 .PostSync = (struct GENX(POSTSYNC_DATA)) {
6248 .Operation = WriteTimestamp,
6249 .DestinationAddress = addr,
6250 .MOCS = anv_mocs(device, NULL, 0),
6251 },
6252 }
6253 });
6254
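/* OR the packed post-sync fields into the caller-provided COMPUTE_WALKER
 * dwords rather than overwriting them, so everything the caller has already
 * programmed in the walker is preserved.
 */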
6255 for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) {
6256 if (dwords[i])
6257 ((uint32_t *)data)[i] |= dwords[i];
6258 }
6259 break;
6260 }
6261
6262 case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
6263 uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
6264
6265 GENX(EXECUTE_INDIRECT_DISPATCH_pack)
6266 (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
6267 .MOCS = anv_mocs(device, NULL, 0),
6268 .COMPUTE_WALKER_BODY = {
6269 .PostSync = (struct GENX(POSTSYNC_DATA)) {
6270 .Operation = WriteTimestamp,
6271 .DestinationAddress = addr,
6272 .MOCS = anv_mocs(device, NULL, 0),
6273 },
6274 }
6275 });
6276
6277 for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) {
6278 if (dwords[i])
6279 ((uint32_t *)data)[i] |= dwords[i];
6280 }
6281 break;
6282 }
6283 #endif
6284
6285 default:
6286 unreachable("invalid");
6287 }
6288 }
6289
6290 void genX(cmd_capture_data)(struct anv_batch *batch,
6291 struct anv_device *device,
6292 struct anv_address dst_addr,
6293 struct anv_address src_addr,
6294 uint32_t size_B) {
6295 struct mi_builder b;
6296 mi_builder_init(&b, device->info, batch);
6297 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
6298 mi_memcpy(&b, dst_addr, src_addr, size_B);
6299 }
6300
6301 void genX(batch_emit_secondary_call)(struct anv_batch *batch,
6302 struct anv_device *device,
6303 struct anv_address secondary_addr,
6304 struct anv_address secondary_return_addr)
6305 {
6306 struct mi_builder b;
6307 mi_builder_init(&b, device->info, batch);
6308 mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
6309 /* Make sure the write in the batch buffer lands before we execute the
6310 * jump.
6311 */
6312 mi_builder_set_write_check(&b, true);
6313
6314 /* Emit a write to change the return address of the secondary */
6315 struct mi_reloc_imm_token reloc =
6316 mi_store_relocated_imm(&b, mi_mem64(secondary_return_addr));
6317
6318 /* Ensure the write has landed before the CS reads the address written
6319 * above.
6320 */
6321 mi_ensure_write_fence(&b);
6322
6323 #if GFX_VER >= 12
6324 /* Disable prefetcher before jumping into a secondary */
6325 anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
6326 arb.PreParserDisableMask = true;
6327 arb.PreParserDisable = true;
6328 }
6329 #endif
6330
6331 /* Jump into the secondary */
6332 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
6333 bbs.AddressSpaceIndicator = ASI_PPGTT;
6334 bbs.SecondLevelBatchBuffer = Firstlevelbatch;
6335 bbs.BatchBufferStartAddress = secondary_addr;
6336 }
6337
6338 /* Replace the return address written by the MI_STORE_DATA_IMM above with
6339 * the primary's current batch address (immediately after the jump).
6340 */
6341 mi_relocate_store_imm(reloc,
6342 anv_address_physical(
6343 anv_batch_current_address(batch)));
6344 }
6345
6346 void *
6347 genX(batch_emit_return)(struct anv_batch *batch)
6348 {
6349 return anv_batch_emitn(batch,
6350 GENX(MI_BATCH_BUFFER_START_length),
6351 GENX(MI_BATCH_BUFFER_START),
6352 .AddressSpaceIndicator = ASI_PPGTT,
6353 .SecondLevelBatchBuffer = Firstlevelbatch);
6354 }
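/* Sketch of how this pairs with genX(batch_emit_secondary_call) above
 * (inferred from the code, not an authoritative statement): the secondary
 * batch ends with this open-coded MI_BATCH_BUFFER_START, and the location
 * of its address field is what the primary passes as secondary_return_addr,
 * so the mi_store_relocated_imm()/mi_relocate_store_imm() pair can patch the
 * return target to the instruction right after the primary's jump.
 */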
6355
6356 /* Wa_16018063123 */
6357 ALWAYS_INLINE void
6358 genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
6359 struct anv_device *device)
6360 {
6361 #if GFX_VERx10 >= 125
6362 anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6363 blt.DestinationBaseAddress = device->workaround_address;
6364 blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
6365 blt.DestinationPitch = 63;
6366 blt.DestinationX2 = 1;
6367 blt.DestinationY2 = 4;
6368 blt.DestinationSurfaceWidth = 1;
6369 blt.DestinationSurfaceHeight = 4;
6370 blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6371 blt.DestinationSurfaceQPitch = 4;
6372 blt.DestinationTiling = XY_TILE_LINEAR;
6373 }
6374 #endif
6375 }
6376
6377 void
6378 genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
6379 const struct intel_urb_config *urb_cfg)
6380 {
6381 #if INTEL_NEEDS_WA_16014912113
6382 const struct intel_urb_config *current =
6383 &cmd_buffer->state.gfx.urb_cfg;
6384 if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
6385 current->size[0] != 0) {
6386 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
6387 #if GFX_VER >= 12
6388 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
6389 urb._3DCommandSubOpcode += i;
6390 urb.VSURBEntryAllocationSize = current->size[i] - 1;
6391 urb.VSURBStartingAddressSlice0 = current->start[i];
6392 urb.VSURBStartingAddressSliceN = current->start[i];
6393 urb.VSNumberofURBEntriesSlice0 = i == 0 ? 256 : 0;
6394 urb.VSNumberofURBEntriesSliceN = i == 0 ? 256 : 0;
6395 }
6396 #else
6397 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
6398 urb._3DCommandSubOpcode += i;
6399 urb.VSURBStartingAddress = current->start[i];
6400 urb.VSURBEntryAllocationSize = current->size[i] - 1;
6401 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
6402 }
6403 #endif
6404 }
6405 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
6406 pc.HDCPipelineFlushEnable = true;
6407 }
6408 }
6409 #endif
6410 }
6411
6412 struct anv_state
6413 genX(cmd_buffer_begin_companion_rcs_syncpoint)(
6414 struct anv_cmd_buffer *cmd_buffer)
6415 {
6416 #if GFX_VERx10 >= 125
6417 const struct intel_device_info *info = cmd_buffer->device->info;
6418 struct anv_state syncpoint =
6419 anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
6420 struct anv_address xcs_wait_addr =
6421 anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
6422 struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
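/* Layout of the 2-dword syncpoint: dword 0 (xcs_wait_addr) is signaled by
 * the RCS in genX(cmd_buffer_end_companion_rcs_syncpoint) and waited on by
 * this queue; dword 1 (rcs_wait_addr) is signaled here and waited on by the
 * companion RCS command buffer.
 */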
6423
6424 /* Reset the sync point */
6425 memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
6426
6427 struct mi_builder b;
6428
6429 /* On CCS:
6430 * - flush all caches & invalidate
6431 * - unblock RCS
6432 * - wait on RCS to complete
6433 * - clear the value we waited on
6434 */
6435
6436 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
6437 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_BARRIER_FLUSH_BITS |
6438 ANV_PIPE_INVALIDATE_BITS |
6439 ANV_PIPE_STALL_BITS,
6440 "post main cmd buffer invalidate");
6441 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6442 } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
6443 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6444 if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
6445 genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
6446 cmd_buffer->device);
6447 }
6448 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
6449 fd.FlushCCS = true; /* Maybe handle Flush LLC */
6450 }
6451 }
6452
6453 {
6454 mi_builder_init(&b, info, &cmd_buffer->batch);
6455 mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
6456 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6457 sem.WaitMode = PollingMode;
6458 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6459 sem.SemaphoreDataDword = 0x1;
6460 sem.SemaphoreAddress = xcs_wait_addr;
6461 }
6462 /* Make sure to reset the semaphore in case the command buffer is run
6463 * multiple times.
6464 */
6465 mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
6466 }
6467
6468 /* On RCS:
6469 * - wait on CCS signal
6470 * - clear the value we waited on
6471 */
6472 {
6473 mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
6474 anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
6475 GENX(MI_SEMAPHORE_WAIT),
6476 sem) {
6477 sem.WaitMode = PollingMode;
6478 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6479 sem.SemaphoreDataDword = 0x1;
6480 sem.SemaphoreAddress = rcs_wait_addr;
6481 }
6482 /* Make sure to reset the semaphore in case the command buffer is run
6483 * multiple times.
6484 */
6485 mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
6486 }
6487
6488 return syncpoint;
6489 #else
6490 unreachable("Not implemented");
6491 #endif
6492 }
6493
6494 void
6495 genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
6496 struct anv_state syncpoint)
6497 {
6498 #if GFX_VERx10 >= 125
6499 struct anv_address xcs_wait_addr =
6500 anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
6501
6502 struct mi_builder b;
6503
6504 /* On RCS:
6505 * - flush all caches & invalidate
6506 * - unblock the CCS
6507 */
6508 anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
6509 ANV_PIPE_BARRIER_FLUSH_BITS |
6510 ANV_PIPE_INVALIDATE_BITS |
6511 ANV_PIPE_STALL_BITS,
6512 "post rcs flush");
6513 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
6514
6515 mi_builder_init(&b, cmd_buffer->device->info,
6516 &cmd_buffer->companion_rcs_cmd_buffer->batch);
6517 mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
6518 #else
6519 unreachable("Not implemented");
6520 #endif
6521 }
6522
6523 void
6524 genX(write_trtt_entries)(struct anv_async_submit *submit,
6525 struct anv_trtt_bind *l3l2_binds,
6526 uint32_t n_l3l2_binds,
6527 struct anv_trtt_bind *l1_binds,
6528 uint32_t n_l1_binds)
6529 {
6530 #if GFX_VER >= 12
6531 const struct intel_device_info *devinfo =
6532 submit->queue->device->info;
6533 struct anv_batch *batch = &submit->batch;
6534
6535 /* BSpec says:
6536 * "DWord Length programmed must not exceed 0x3FE."
6537 * For a single dword write the programmed length is 2, and for a single
6538 * qword it's 3. This is the value actually written to the length field,
6539 * i.e. it does not include the length bias.
6540 */
6541 uint32_t dword_write_len = 2;
6542 uint32_t qword_write_len = 3;
6543 uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
6544 uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
6545
6546 /* What makes the code below quite complicated is the fact that we can
6547 * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
6548 * contiguous addresses.
6549 */
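/* Worked example with made-up addresses: three L3L2 binds at pte_addr
 * 0x1000, 0x1008 and 0x1010 are qword-contiguous, so they collapse into a
 * single MI_STORE_DATA_IMM with three qword payloads; a bind at 0x1020
 * would break the run and start a new command.
 */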
6550
6551 for (uint32_t i = 0; i < n_l3l2_binds; i++) {
6552 int extra_writes = 0;
6553 for (uint32_t j = i + 1;
6554 j < n_l3l2_binds && extra_writes <= max_qword_extra_writes;
6555 j++) {
6556 if (l3l2_binds[i].pte_addr + (j - i) * 8 == l3l2_binds[j].pte_addr) {
6557 extra_writes++;
6558 } else {
6559 break;
6560 }
6561 }
6562 bool is_last_write = n_l1_binds == 0 &&
6563 i + extra_writes + 1 == n_l3l2_binds;
6564
6565 uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
6566 qword_write_len + (extra_writes * 2);
6567 uint32_t *dw;
6568 dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
6569 .ForceWriteCompletionCheck = is_last_write,
6570 .StoreQword = true,
6571 .Address = anv_address_from_u64(l3l2_binds[i].pte_addr),
6572 );
6573 dw += 3;
6574 for (uint32_t j = 0; j < extra_writes + 1; j++) {
6575 uint64_t entry_addr_64b = l3l2_binds[i + j].entry_addr;
6576 *dw = entry_addr_64b & 0xFFFFFFFF;
6577 dw++;
6578 *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
6579 dw++;
6580 }
6581 assert(dw == batch->next);
6582
6583 i += extra_writes;
6584 }
6585
6586 for (uint32_t i = 0; i < n_l1_binds; i++) {
6587 int extra_writes = 0;
6588 for (uint32_t j = i + 1;
6589 j < n_l1_binds && extra_writes <= max_dword_extra_writes;
6590 j++) {
6591 if (l1_binds[i].pte_addr + (j - i) * 4 ==
6592 l1_binds[j].pte_addr) {
6593 extra_writes++;
6594 } else {
6595 break;
6596 }
6597 }
6598
6599 bool is_last_write = i + extra_writes + 1 == n_l1_binds;
6600
6601 uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
6602 dword_write_len + extra_writes;
6603 uint32_t *dw;
6604 dw = anv_batch_emitn(batch, total_len, GENX(MI_STORE_DATA_IMM),
6605 .ForceWriteCompletionCheck = is_last_write,
6606 .Address = anv_address_from_u64(l1_binds[i].pte_addr),
6607 );
6608 dw += 3;
6609 for (uint32_t j = 0; j < extra_writes + 1; j++) {
6610 *dw = (l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
6611 dw++;
6612 }
6613 assert(dw == batch->next);
6614
6615 i += extra_writes;
6616 }
6617
6618 genx_batch_emit_pipe_control(batch, devinfo, _3D,
6619 ANV_PIPE_CS_STALL_BIT |
6620 ANV_PIPE_TLB_INVALIDATE_BIT);
6621 #else
6622 unreachable("Not implemented");
6623 #endif
6624 }
6625
6626 void
6627 genX(async_submit_end)(struct anv_async_submit *submit)
6628 {
6629 struct anv_batch *batch = &submit->batch;
6630 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
6631 }
6632
6633 void
6634 genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
6635 VkPipelineStageFlags2 stage,
6636 VkBuffer dstBuffer,
6637 VkDeviceSize dstOffset,
6638 uint32_t marker)
6639 {
6640 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
6641 ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
6642
6643 /* The barriers inserted by the application to make dstBuffer writable
6644 * should already have the L1/L2 cache flushes. On platforms where the
6645 * command streamer is not coherent with L3, we need an additional set of
6646 * cache flushes.
6647 */
6648 enum anv_pipe_bits bits =
6649 (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
6650 (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
6651 ANV_PIPE_END_OF_PIPE_SYNC_BIT;
6652
6653 trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
6654
6655 anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
6656 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
6657
6658 struct mi_builder b;
6659 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
6660
6661 /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
6662 * would be the logical way to implement this extension, as it could
6663 * do a pipelined marker write. Unfortunately, it requires writing
6664 * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
6665 * 32-bit value. MI_STORE_DATA_IMM is the only good way to do that,
6666 * and unfortunately it requires stalling.
6667 */
6668 mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
6669 mi_imm(marker));
6670
6671 trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
6672 }
6673
6674 void
6675 genX(cmd_write_buffer_cp)(struct anv_cmd_buffer *cmd_buffer,
6676 VkDeviceAddress dstAddr,
6677 void *data,
6678 uint32_t size)
6679 {
6680 assert(size % 4 == 0);
6681 struct anv_address addr = anv_address_from_u64(dstAddr);
6682
6683 struct mi_builder b;
6684 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
6685
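/* Write the payload as 64-bit immediates; a trailing 4-byte remainder falls
 * back to a 32-bit store, and the write-completion check is only requested
 * for the final store.
 */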
6686 for (uint32_t i = 0; i < size; i += 8) {
6687 mi_builder_set_write_check(&b, i >= size - 8);
6688 if (size - i < 8) {
6689 mi_store(&b, mi_mem32(anv_address_add(addr, i)),
6690 mi_imm(*((uint32_t *)((char*)data + i))));
6691 } else {
6692 mi_store(&b, mi_mem64(anv_address_add(addr, i)),
6693 mi_imm(*((uint64_t *)((char*)data + i))));
6694 }
6695 }
6696 }
6697