1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31
32 #include "common/intel_aux_map.h"
33 #include "genxml/gen_macros.h"
34 #include "genxml/genX_pack.h"
35 #include "genxml/genX_rt_pack.h"
36 #include "common/intel_genX_state_brw.h"
37
38 #include "ds/intel_tracepoints.h"
39
40 /* We reserve:
41 * - GPR 14 for secondary command buffer returns
42 * - GPR 15 for conditional rendering
43 */
44 #define MI_BUILDER_NUM_ALLOC_GPRS 14
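/* These hooks tell the shared mi_builder/gen_pack helpers how to allocate
 * batch space and resolve addresses through anv's batch utilities.
 */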
45 #define __gen_get_batch_dwords anv_batch_emit_dwords
46 #define __gen_address_offset anv_address_add
47 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
48 #include "common/mi_builder.h"
49
50 #include "genX_cmd_draw_generated_flush.h"
51
52 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
53 uint32_t pipeline);
54
55 static enum anv_pipe_bits
56 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
57 enum anv_pipe_bits bits = 0;
58 bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
59 bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
60 #if GFX_VERx10 >= 125
61 bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
62 #endif
63 #if GFX_VER == 12
64 bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
65 #endif
66 #if GFX_VER >= 12
67 bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
68 #endif
69 bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
70 bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
71 bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
72 bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
73 bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
74 bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
75 bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
76 bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
77 bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
78 #if GFX_VERx10 == 125
79 bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
80 bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
81 #endif
82 return bits;
83 }
84
85 #define anv_debug_dump_pc(pc, reason) \
86 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
87 fputs("pc: emit PC=( ", stdout); \
88 anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout); \
89 fprintf(stdout, ") reason: %s\n", reason); \
90 }
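
/* A minimal usage sketch (hypothetical call site): the macro is intended to
 * be invoked from inside an anv_batch_emit(..., GENX(PIPE_CONTROL), pc)
 * block, e.g.:
 *
 *    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
 *       pc.CommandStreamerStallEnable = true;
 *       anv_debug_dump_pc(pc, "example CS stall");
 *    }
 */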
91
92 void
93 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
94 {
95 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
96 anv_cmd_buffer_is_video_queue(cmd_buffer))
97 return;
98
99 struct anv_device *device = cmd_buffer->device;
100 uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
101
102 /* If we are emitting a new state base address we probably need to re-emit
103 * binding tables.
104 */
105 cmd_buffer->state.descriptors_dirty |= ~0;
106
107 #if GFX_VERx10 >= 125
108 genx_batch_emit_pipe_control(&cmd_buffer->batch,
109 cmd_buffer->device->info,
110 cmd_buffer->state.current_pipeline,
111 ANV_PIPE_CS_STALL_BIT);
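/* On Gfx12.5+, binding table entries are allocated from a dedicated pool
 * programmed here, rather than relative to Surface State Base Address.
 */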
112 anv_batch_emit(
113 &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
114 btpa.BindingTablePoolBaseAddress =
115 anv_cmd_buffer_surface_base_address(cmd_buffer);
116 btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
117 btpa.MOCS = mocs;
118 }
119 #else /* GFX_VERx10 < 125 */
120 /* Emit a render target cache flush.
121 *
122 * This isn't documented anywhere in the PRM. However, it seems to be
123 * necessary prior to changing the surface state base address. Without
124 * this, we get GPU hangs when using multi-level command buffers which
125 * clear depth, reset state base address, and then go render stuff.
126 */
127 genx_batch_emit_pipe_control
128 (&cmd_buffer->batch, cmd_buffer->device->info,
129 cmd_buffer->state.current_pipeline,
130 #if GFX_VER >= 12
131 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
132 #else
133 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
134 #endif
135 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
136 ANV_PIPE_CS_STALL_BIT);
137
138 #if INTEL_NEEDS_WA_1607854226
139 /* Wa_1607854226:
140 *
141 * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
142 * mode by putting the pipeline temporarily in 3D mode.
143 */
144 uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline;
145 genX(flush_pipeline_select_3d)(cmd_buffer);
146 #endif
147
148 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
149 sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
150 sba.GeneralStateMOCS = mocs;
151 sba.GeneralStateBaseAddressModifyEnable = true;
152
153 sba.StatelessDataPortAccessMOCS = mocs;
154
155 sba.SurfaceStateBaseAddress =
156 anv_cmd_buffer_surface_base_address(cmd_buffer);
157 sba.SurfaceStateMOCS = mocs;
158 sba.SurfaceStateBaseAddressModifyEnable = true;
159
160 sba.DynamicStateBaseAddress =
161 (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
162 sba.DynamicStateMOCS = mocs;
163 sba.DynamicStateBaseAddressModifyEnable = true;
164
165 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
166 sba.IndirectObjectMOCS = mocs;
167 sba.IndirectObjectBaseAddressModifyEnable = true;
168
169 sba.InstructionBaseAddress =
170 (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
171 sba.InstructionMOCS = mocs;
172 sba.InstructionBaseAddressModifyEnable = true;
173
174 sba.GeneralStateBufferSize = 0xfffff;
175 sba.IndirectObjectBufferSize = 0xfffff;
176 sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
177 device->physical->va.sampler_state_pool.size) / 4096;
178 sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
179 sba.GeneralStateBufferSizeModifyEnable = true;
180 sba.IndirectObjectBufferSizeModifyEnable = true;
181 sba.DynamicStateBufferSizeModifyEnable = true;
182 sba.InstructionBuffersizeModifyEnable = true;
183
184 #if GFX_VER >= 11
185 sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
186 sba.BindlessSamplerStateBufferSize = 0;
187 sba.BindlessSamplerStateMOCS = mocs;
188 sba.BindlessSamplerStateBaseAddressModifyEnable = true;
189 #endif
190
191 if (!device->physical->indirect_descriptors) {
192 #if GFX_VERx10 >= 125
193 /* Bindless Surface State & Bindless Sampler State are aligned to the
194 * same heap
195 */
196 sba.BindlessSurfaceStateBaseAddress =
197 (struct anv_address) { .offset =
198 device->physical->va.binding_table_pool.addr, };
199 sba.BindlessSurfaceStateSize =
200 (device->physical->va.internal_surface_state_pool.size +
201 device->physical->va.bindless_surface_state_pool.size) - 1;
202 sba.BindlessSurfaceStateMOCS = mocs;
203 sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
204 #else
205 unreachable("Direct descriptor not supported");
206 #endif
207 } else {
208 sba.BindlessSurfaceStateBaseAddress =
209 (struct anv_address) { .offset =
210 device->physical->va.bindless_surface_state_pool.addr,
211 };
212 sba.BindlessSurfaceStateSize =
213 anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
214 sba.BindlessSurfaceStateMOCS = mocs;
215 sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
216 }
217
218 #if GFX_VERx10 >= 125
219 sba.L1CacheControl = L1CC_WB;
220 #endif
221 }
222
223 #if INTEL_NEEDS_WA_1607854226
224 /* Wa_1607854226:
225 *
226 * Put the pipeline back into its current mode.
227 */
228 if (gfx12_wa_pipeline != UINT32_MAX)
229 genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
230 #endif
231
232 #endif /* GFX_VERx10 < 125 */
233
234 /* After re-setting the surface state base address, we have to do some
235 * cache flushing so that the sampler engine will pick up the new
236 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
237 * Shared Function > 3D Sampler > State > State Caching (page 96):
238 *
239 * Coherency with system memory in the state cache, like the texture
240 * cache is handled partially by software. It is expected that the
241 * command stream or shader will issue Cache Flush operation or
242 * Cache_Flush sampler message to ensure that the L1 cache remains
243 * coherent with system memory.
244 *
245 * [...]
246 *
247 * Whenever the value of the Dynamic_State_Base_Addr,
248 * Surface_State_Base_Addr are altered, the L1 state cache must be
249 * invalidated to ensure the new surface or sampler state is fetched
250 * from system memory.
251 *
252 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
253 * which, according the PIPE_CONTROL instruction documentation in the
254 * Broadwell PRM:
255 *
256 * Setting this bit is independent of any other bit in this packet.
257 * This bit controls the invalidation of the L1 and L2 state caches
258 * at the top of the pipe i.e. at the parsing time.
259 *
260 * Unfortunately, experimentation seems to indicate that state cache
261 * invalidation through a PIPE_CONTROL does nothing whatsoever in
262 * regards to surface state and binding tables. Instead, it seems that
263 * invalidating the texture cache is what is actually needed.
264 *
265 * XXX: As far as we have been able to determine through
266 * experimentation, flushing the texture cache appears to be
267 * sufficient. The theory here is that all of the sampling/rendering
268 * units cache the binding table in the texture cache. However, we have
269 * yet to be able to actually confirm this.
270 *
271 * Wa_14013910100:
272 *
273 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
274 * or program pipe control with Instruction cache invalidate post
275 * STATE_BASE_ADDRESS command"
276 */
277 enum anv_pipe_bits bits =
278 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
279 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
280 #if GFX_VERx10 == 125
281 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
282 #endif
283 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
284
285 #if GFX_VER >= 9 && GFX_VER <= 11
286 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
287 *
288 * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
289 * always set for GPGPU workloads when “Texture Cache Invalidation
290 * Enable” bit is set".
291 *
292 * This workaround stopped appearing in the TGL PRMs.
293 */
294 if (cmd_buffer->state.current_pipeline == GPGPU)
295 bits |= ANV_PIPE_CS_STALL_BIT;
296 #endif
297 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
298 cmd_buffer->state.current_pipeline,
299 bits);
300 }
301
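/* Track the BO backing a surface so it is referenced (and thus made
 * resident) when the command buffer is submitted.
 */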
302 static void
303 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
304 struct anv_address addr)
305 {
306 VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
307 addr.bo);
308
309 if (unlikely(result != VK_SUCCESS))
310 anv_batch_set_error(&cmd_buffer->batch, result);
311 }
312
313 static void
314 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
315 const struct anv_surface_state *state)
316 {
317 assert(!anv_address_is_null(state->address));
318 add_surface_reloc(cmd_buffer, state->address);
319
320 if (!anv_address_is_null(state->aux_address)) {
321 VkResult result =
322 anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
323 state->aux_address.bo);
324 if (result != VK_SUCCESS)
325 anv_batch_set_error(&cmd_buffer->batch, result);
326 }
327
328 if (!anv_address_is_null(state->clear_address)) {
329 VkResult result =
330 anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
331 state->clear_address.bo);
332 if (result != VK_SUCCESS)
333 anv_batch_set_error(&cmd_buffer->batch, result);
334 }
335 }
336
337 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
338 * the initial layout is undefined, the HiZ buffer and depth buffer will
339 * represent the same data at the end of this operation.
340 */
341 static void
342 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
343 const struct anv_image *image,
344 uint32_t base_layer, uint32_t layer_count,
345 VkImageLayout initial_layout,
346 VkImageLayout final_layout,
347 bool will_full_fast_clear)
348 {
349 const uint32_t depth_plane =
350 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
351 if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
352 return;
353
354 /* If will_full_fast_clear is set, the caller promises to fast-clear the
355 * largest portion of the specified range as it can. For depth images,
356 * that means the entire image because we don't support multi-LOD HiZ.
357 */
358 assert(image->planes[0].primary_surface.isl.levels == 1);
359 if (will_full_fast_clear)
360 return;
361
362 const enum isl_aux_state initial_state =
363 anv_layout_to_aux_state(cmd_buffer->device->info, image,
364 VK_IMAGE_ASPECT_DEPTH_BIT,
365 initial_layout,
366 cmd_buffer->queue_family->queueFlags);
367 const enum isl_aux_state final_state =
368 anv_layout_to_aux_state(cmd_buffer->device->info, image,
369 VK_IMAGE_ASPECT_DEPTH_BIT,
370 final_layout,
371 cmd_buffer->queue_family->queueFlags);
372
373 const bool initial_depth_valid =
374 isl_aux_state_has_valid_primary(initial_state);
375 const bool initial_hiz_valid =
376 isl_aux_state_has_valid_aux(initial_state);
377 const bool final_needs_depth =
378 isl_aux_state_has_valid_primary(final_state);
379 const bool final_needs_hiz =
380 isl_aux_state_has_valid_aux(final_state);
381
382 /* Getting into the pass-through state for Depth is tricky and involves
383 * both a resolve and an ambiguate. We don't handle that state right now
384 * as anv_layout_to_aux_state never returns it.
385 */
386 assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
387
388 if (final_needs_depth && !initial_depth_valid) {
389 assert(initial_hiz_valid);
390 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
391 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
392 } else if (final_needs_hiz && !initial_hiz_valid) {
393 assert(initial_depth_valid);
394 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
395 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
396 }
397
398 /* Additional tile cache flush for MTL:
399 *
400 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
401 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
402 */
403 if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
404 image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
405 final_needs_depth && !initial_depth_valid) {
406 anv_add_pending_pipe_bits(cmd_buffer,
407 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
408 "HIZ-CCS flush");
409 }
410 }
411
412 /* Transitions a stencil buffer from one layout to another. On Gfx12, when
413 * stencil compression is enabled and the initial layout is undefined, the
414 * stencil buffer must be initialized with a stencil clear (HZ_OP) before use.
415 */
416 static void
417 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
418 const struct anv_image *image,
419 uint32_t base_level, uint32_t level_count,
420 uint32_t base_layer, uint32_t layer_count,
421 VkImageLayout initial_layout,
422 VkImageLayout final_layout,
423 bool will_full_fast_clear)
424 {
425 #if GFX_VER == 12
426 const uint32_t plane =
427 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
428 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
429 return;
430
431 if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
432 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
433 cmd_buffer->device->info->has_aux_map) {
434 /* If will_full_fast_clear is set, the caller promises to fast-clear the
435 * largest portion of the specified range as it can.
436 */
437 if (will_full_fast_clear)
438 return;
439
440 for (uint32_t l = 0; l < level_count; l++) {
441 const uint32_t level = base_level + l;
442 const VkRect2D clear_rect = {
443 .offset.x = 0,
444 .offset.y = 0,
445 .extent.width = u_minify(image->vk.extent.width, level),
446 .extent.height = u_minify(image->vk.extent.height, level),
447 };
448
449 uint32_t aux_layers =
450 anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level);
451 uint32_t level_layer_count =
452 MIN2(layer_count, aux_layers - base_layer);
453
454 /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression
455 * Enable:
456 *
457 * "When enabled, Stencil Buffer needs to be initialized via
458 * stencil clear (HZ_OP) before any renderpass."
459 */
460 anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
461 level, base_layer, level_layer_count,
462 clear_rect, 0 /* Stencil clear value */);
463 }
464 }
465
466 /* Additional tile cache flush for MTL:
467 *
468 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
469 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
470 */
471 if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
472 anv_add_pending_pipe_bits(cmd_buffer,
473 ANV_PIPE_TILE_CACHE_FLUSH_BIT,
474 "HIZ-CCS flush");
475 }
476 #endif
477 }
478
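/* MMIO offsets of the MI predication registers used below to build
 * predicates for conditional CCS/MCS resolves.
 */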
479 #define MI_PREDICATE_SRC0 0x2400
480 #define MI_PREDICATE_SRC1 0x2408
481 #define MI_PREDICATE_RESULT 0x2418
482
483 static void
484 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
485 const struct anv_image *image,
486 VkImageAspectFlagBits aspect,
487 uint32_t level,
488 uint32_t base_layer, uint32_t layer_count,
489 bool compressed)
490 {
491 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
492
493 /* We only have compression tracking for CCS_E */
494 if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
495 return;
496
497 for (uint32_t a = 0; a < layer_count; a++) {
498 uint32_t layer = base_layer + a;
499 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
500 sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device,
501 image, aspect,
502 level, layer);
503 sdi.ImmediateData = compressed ? UINT32_MAX : 0;
504 }
505 }
506
507 /* FCV_CCS_E images are automatically fast cleared to default value at
508 * render time. In order to account for this, anv should set the
509 * appropriate fast clear state for level0/layer0.
510 *
511 * At the moment, tracking the fast clear state for higher levels/layers is
512 * neither supported, nor do we enter a situation where it is a concern.
513 */
514 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
515 base_layer == 0 && level == 0) {
516 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
517 sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
518 image, aspect);
519 sdi.ImmediateData = ANV_FAST_CLEAR_DEFAULT_VALUE;
520 }
521 }
522 }
523
524 static void
525 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
526 const struct anv_image *image,
527 VkImageAspectFlagBits aspect,
528 enum anv_fast_clear_type fast_clear)
529 {
530 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
531 sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
532 image, aspect);
533 sdi.ImmediateData = fast_clear;
534 }
535
536 /* Whenever we have fast-clear, we consider that slice to be compressed.
537 * This makes building predicates much easier.
538 */
539 if (fast_clear != ANV_FAST_CLEAR_NONE)
540 set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true);
541 }
542
543 /* This is only really practical on Haswell and above because it requires
544 * MI math in order to get it correct.
545 */
546 static void
547 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
548 const struct anv_image *image,
549 VkImageAspectFlagBits aspect,
550 uint32_t level, uint32_t array_layer,
551 enum isl_aux_op resolve_op,
552 enum anv_fast_clear_type fast_clear_supported)
553 {
554 struct anv_address addr = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
555 image, aspect);
556 struct mi_builder b;
557 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
558 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
559 mi_builder_set_mocs(&b, mocs);
560
561 const struct mi_value fast_clear_type = mi_mem32(addr);
562
563 if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
564 /* In this case, we're doing a full resolve which means we want the
565 * resolve to happen if any compression (including fast-clears) is
566 * present.
567 *
568 * In order to simplify the logic a bit, we make the assumption that,
569 * if the first slice has been fast-cleared, it is also marked as
570 * compressed. See also set_image_fast_clear_state.
571 */
572 const struct mi_value compression_state =
573 mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device,
574 image, aspect,
575 level, array_layer));
576 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state);
577 mi_store(&b, compression_state, mi_imm(0));
578
579 if (level == 0 && array_layer == 0) {
580 /* If the predicate is true, we want to write 0 to the fast clear type
581 * and, if it's false, leave it alone. We can do this by writing
582 *
583 * clear_type = clear_type & ~predicate;
584 */
585 struct mi_value new_fast_clear_type =
586 mi_iand(&b, fast_clear_type,
587 mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0)));
588 mi_store(&b, fast_clear_type, new_fast_clear_type);
589 }
590 } else if (level == 0 && array_layer == 0) {
591 /* In this case, we are doing a partial resolve to get rid of fast-clear
592 * colors. We don't care about the compression state but we do care
593 * about how much fast clear is allowed by the final layout.
594 */
595 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
596 assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
597
598 /* We need to compute (fast_clear_supported < image->fast_clear) */
599 struct mi_value pred =
600 mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
601 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
602
603 /* If the predicate is true, we want to write 0 to the fast clear type
604 * and, if it's false, leave it alone. We can do this by writing
605 *
606 * clear_type = clear_type & ~predicate;
607 */
608 struct mi_value new_fast_clear_type =
609 mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
610 mi_store(&b, fast_clear_type, new_fast_clear_type);
611 } else {
612 /* In this case, we're trying to do a partial resolve on a slice that
613 * doesn't have clear color. There's nothing to do.
614 */
615 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
616 return;
617 }
618
619 /* Set src1 to 0 and use a != condition */
620 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
621
622 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
623 mip.LoadOperation = LOAD_LOADINV;
624 mip.CombineOperation = COMBINE_SET;
625 mip.CompareOperation = COMPARE_SRCS_EQUAL;
626 }
627 }
628
629 static void
630 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
631 const struct anv_image *image,
632 enum isl_format format,
633 struct isl_swizzle swizzle,
634 VkImageAspectFlagBits aspect,
635 uint32_t level, uint32_t array_layer,
636 enum isl_aux_op resolve_op,
637 enum anv_fast_clear_type fast_clear_supported)
638 {
639 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
640
641 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
642 aspect, level, array_layer,
643 resolve_op, fast_clear_supported);
644
645 /* CCS_D only supports full resolves and BLORP will assert on us if we try
646 * to do a partial resolve on a CCS_D surface.
647 */
648 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
649 image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
650 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
651
652 anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
653 level, array_layer, 1, resolve_op, NULL, true);
654 }
655
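/* Multisampled (MCS) counterpart to anv_cmd_predicated_ccs_resolve(); only
 * partial resolves of the fast-clear color are supported (see the asserts
 * below).
 */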
656 static void
657 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
658 const struct anv_image *image,
659 enum isl_format format,
660 struct isl_swizzle swizzle,
661 VkImageAspectFlagBits aspect,
662 uint32_t array_layer,
663 enum isl_aux_op resolve_op,
664 enum anv_fast_clear_type fast_clear_supported)
665 {
666 assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
667 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
668
669 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
670 aspect, 0, array_layer,
671 resolve_op, fast_clear_supported);
672
673 anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
674 array_layer, 1, resolve_op, NULL, true);
675 }
676
677 void
678 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
679 const struct anv_image *image,
680 VkImageAspectFlagBits aspect,
681 enum isl_aux_usage aux_usage,
682 uint32_t level,
683 uint32_t base_layer,
684 uint32_t layer_count)
685 {
686 /* The aspect must be exactly one of the image aspects. */
687 assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
688
689 /* Filter out aux usages that don't have any compression tracking.
690 * Note: We only have compression tracking for CCS_E images, but it's
691 * possible for a CCS_E enabled image to have a subresource with a different
692 * aux usage.
693 */
694 if (!isl_aux_usage_has_compression(aux_usage))
695 return;
696
697 set_image_compressed_bit(cmd_buffer, image, aspect,
698 level, base_layer, layer_count, true);
699 }
700
701 static void
702 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
703 const struct anv_image *image,
704 VkImageAspectFlagBits aspect)
705 {
706 assert(cmd_buffer && image);
707 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
708
709 /* Initialize the struct fields that are accessed for fast clears so that
710 * the HW restrictions on the field values are satisfied.
711 *
712 * On generations that do not support indirect clear color natively, we
713 * can just skip initializing the values, because they will be set by
714 * BLORP before actually doing the fast clear.
715 *
716 * For newer generations, we may not be able to skip initialization.
717 * Testing shows that writing to CLEAR_COLOR causes corruption if
718 * the surface is currently being used. So, care must be taken here.
719 * There are two cases that we consider:
720 *
721 * 1. For CCS_E without FCV, we can skip initializing the color-related
722 * fields, just like on the older platforms. Also, DWORDS 6 and 7
723 * are marked MBZ (or have a usable field on gfx11), but we can skip
724 * initializing them because in practice these fields need other
725 * state to be programmed for their values to matter.
726 *
727 * 2. When the FCV optimization is enabled, we must initialize the
728 * color-related fields. Otherwise, the engine might reference their
729 * uninitialized contents before we fill them for a manual fast clear
730 * with BLORP. Although the surface may be in use, no synchronization
731 * is needed before initialization. The only possible clear color we
732 * support in this mode is 0.
733 */
734 #if GFX_VER == 12
735 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
736
737 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
738 assert(!image->planes[plane].can_non_zero_fast_clear);
739 assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
740
741 unsigned num_dwords = 6;
742 struct anv_address addr =
743 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
744
745 for (unsigned i = 0; i < num_dwords; i++) {
746 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
747 sdi.Address = addr;
748 sdi.Address.offset += i * 4;
749 sdi.ImmediateData = 0;
750 sdi.ForceWriteCompletionCheck = i == (num_dwords - 1);
751 }
752 }
753 }
754 #endif
755 }
756
757 /* Copy the fast-clear value dword(s) between a surface state object and an
758 * image's fast clear state buffer.
759 */
760 void
761 genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
762 struct anv_state surface_state,
763 const struct anv_image *image)
764 {
765 #if GFX_VER < 10
766 assert(cmd_buffer && image);
767 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
768
769 struct anv_address ss_clear_addr =
770 anv_state_pool_state_address(
771 &cmd_buffer->device->internal_surface_state_pool,
772 (struct anv_state) {
773 .offset = surface_state.offset +
774 cmd_buffer->device->isl_dev.ss.clear_value_offset
775 });
776 const struct anv_address entry_addr =
777 anv_image_get_clear_color_addr(cmd_buffer->device, image,
778 VK_IMAGE_ASPECT_COLOR_BIT);
779 unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
780
781 struct mi_builder b;
782 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
783
784 mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
785
786 /* Updating a surface state object may require that the state cache be
787 * invalidated. From the SKL PRM, Shared Functions -> State -> State
788 * Caching:
789 *
790 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
791 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
792 * modified [...], the L1 state cache must be invalidated to ensure
793 * the new surface or sampler state is fetched from system memory.
794 *
795 * In testing, SKL doesn't actually seem to need this, but HSW does.
796 */
797 anv_add_pending_pipe_bits(cmd_buffer,
798 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
799 "after load_image_clear_color surface state update");
800 #endif
801 }
802
803 void
804 genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
805 const struct anv_image *image,
806 const enum isl_format format,
807 union isl_color_value clear_color)
808 {
809 if (isl_color_value_is_zero(clear_color, format)) {
810 /* This image has the auxiliary buffer enabled. We can mark the
811 * subresource as not needing a resolve because the clear color
812 * will match what's in every RENDER_SURFACE_STATE object when
813 * it's being used for sampling.
814 */
815 set_image_fast_clear_state(cmd_buffer, image,
816 VK_IMAGE_ASPECT_COLOR_BIT,
817 ANV_FAST_CLEAR_DEFAULT_VALUE);
818 } else {
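/* A non-zero clear color was used, so any layout that cannot handle
 * arbitrary clear colors will require a resolve before use.
 */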
819 set_image_fast_clear_state(cmd_buffer, image,
820 VK_IMAGE_ASPECT_COLOR_BIT,
821 ANV_FAST_CLEAR_ANY);
822 }
823 }
824
825 /**
826 * @brief Transitions a color buffer from one layout to another.
827 *
828 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
829 * more information.
830 *
831 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
832 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
833 * this represents the maximum layers to transition at each
834 * specified miplevel.
835 */
836 static void
837 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
838 const struct anv_image *image,
839 VkImageAspectFlagBits aspect,
840 const uint32_t base_level, uint32_t level_count,
841 uint32_t base_layer, uint32_t layer_count,
842 VkImageLayout initial_layout,
843 VkImageLayout final_layout,
844 uint32_t src_queue_family,
845 uint32_t dst_queue_family,
846 bool will_full_fast_clear)
847 {
848 struct anv_device *device = cmd_buffer->device;
849 const struct intel_device_info *devinfo = device->info;
850 /* Validate the inputs. */
851 assert(cmd_buffer);
852 assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
853 /* These values aren't supported for simplicity's sake. */
854 assert(level_count != VK_REMAINING_MIP_LEVELS &&
855 layer_count != VK_REMAINING_ARRAY_LAYERS);
856 /* Ensure the subresource range is valid. */
857 UNUSED uint64_t last_level_num = base_level + level_count;
858 const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
859 UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
860 assert((uint64_t)base_layer + layer_count <= image_layers);
861 assert(last_level_num <= image->vk.mip_levels);
862 /* If there is a layout transition, the final layout cannot be undefined or
863 * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
864 */
865 assert(initial_layout == final_layout ||
866 (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
867 final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
868 const struct isl_drm_modifier_info *isl_mod_info =
869 image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
870 ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
871 : NULL;
872
873 const bool src_queue_external =
874 src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
875 src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
876
877 const bool dst_queue_external =
878 dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
879 dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
880
881 /* If the queues are external, consider the first queue family flags
882 * (should be the most capable)
883 */
884 const VkQueueFlagBits src_queue_flags =
885 device->physical->queue.families[
886 (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
887 0 : src_queue_family].queueFlags;
888 const VkQueueFlagBits dst_queue_flags =
889 device->physical->queue.families[
890 (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
891 0 : dst_queue_family].queueFlags;
892
893 /* Simultaneous acquire and release on external queues is illegal. */
894 assert(!src_queue_external || !dst_queue_external);
895
896 /* Ownership transition on an external queue requires special action if the
897 * image has a DRM format modifier because we store image data in
898 * a driver-private bo which is inaccessible to the external queue.
899 */
900 const bool private_binding_acquire =
901 src_queue_external &&
902 anv_image_is_externally_shared(image) &&
903 anv_image_has_private_binding(image);
904
905 const bool private_binding_release =
906 dst_queue_external &&
907 anv_image_is_externally_shared(image) &&
908 anv_image_has_private_binding(image);
909
910 if (initial_layout == final_layout &&
911 !private_binding_acquire && !private_binding_release) {
912 /* No work is needed. */
913 return;
914 }
915
916 /**
917 * Section 7.7.4 of the Vulkan 1.3.260 spec says:
918 *
919 * If the transfer is via an image memory barrier, and an image layout
920 * transition is desired, then the values of oldLayout and newLayout in the
921 * release operation's memory barrier must be equal to values of oldLayout
922 * and newLayout in the acquire operation's memory barrier. Although the
923 * image layout transition is submitted twice, it will only be executed
924 * once. A layout transition specified in this way happens-after the
925 * release operation and happens-before the acquire operation.
926 *
927 * Because we know that we get matching transitions on each queue, we choose
928 * to only do the work on one queue type: RENDER. In the cases where we do
929 * transitions between COMPUTE & TRANSFER, we should have matching
930 * aux/fast_clear values which would trigger no work in the code below.
931 */
932 if (!(src_queue_external || dst_queue_external) &&
933 src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
934 dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
935 src_queue_family != dst_queue_family) {
936 enum intel_engine_class src_engine =
937 cmd_buffer->queue_family->engine_class;
938 if (src_engine != INTEL_ENGINE_CLASS_RENDER)
939 return;
940 }
941
942 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
943
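/* If the first layer to transition is beyond the last layer with aux data,
 * there is nothing to do.
 */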
944 if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
945 return;
946
947 enum isl_aux_usage initial_aux_usage =
948 anv_layout_to_aux_usage(devinfo, image, aspect, 0,
949 initial_layout, src_queue_flags);
950 enum isl_aux_usage final_aux_usage =
951 anv_layout_to_aux_usage(devinfo, image, aspect, 0,
952 final_layout, dst_queue_flags);
953 enum anv_fast_clear_type initial_fast_clear =
954 anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
955 src_queue_flags);
956 enum anv_fast_clear_type final_fast_clear =
957 anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
958 dst_queue_flags);
959
960 /* We must override the anv_layout_to_* functions because they are unaware
961 * of acquire/release direction.
962 */
963 if (private_binding_acquire) {
964 initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
965 image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
966 initial_fast_clear = isl_mod_info->supports_clear_color ?
967 initial_fast_clear : ANV_FAST_CLEAR_NONE;
968 } else if (private_binding_release) {
969 final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
970 image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
971 final_fast_clear = isl_mod_info->supports_clear_color ?
972 final_fast_clear : ANV_FAST_CLEAR_NONE;
973 }
974
975 assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
976
977 /* The following layouts are equivalent for non-linear images. */
978 const bool initial_layout_undefined =
979 initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
980 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
981
982 bool must_init_fast_clear_state = false;
983 bool must_init_aux_surface = false;
984
985 if (initial_layout_undefined) {
986 /* The subresource may have been aliased and populated with arbitrary
987 * data.
988 */
989 must_init_fast_clear_state = true;
990
991 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
992 devinfo->has_illegal_ccs_values) {
993
994 must_init_aux_surface = true;
995
996 } else {
997 assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
998
999 /* We can start using the CCS immediately without ambiguating. The
1000 * two conditions that enable this are:
1001 *
1002 * 1) The device treats all possible CCS values as legal. In other
1003 * words, we can't confuse the hardware with random bits in the
1004 * CCS.
1005 *
1006 * 2) We enable compression on all writable image layouts. The CCS
1007 * will receive all writes and will therefore always be in sync
1008 * with the main surface.
1009 *
1010 * If we were to disable compression on some writable layouts, the
1011 * CCS could get out of sync with the main surface and the app
1012 * could lose the data it wrote previously. For example, this
1013 * could happen if an app: transitions from UNDEFINED w/o
1014 * ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
1015 *
1016 * The second condition is asserted below, but could be moved
1017 * elsewhere for more coverage (we're only checking transitions from
1018 * an undefined layout).
1019 */
1020 assert(vk_image_layout_is_read_only(final_layout, aspect) ||
1021 (final_aux_usage != ISL_AUX_USAGE_NONE));
1022
1023 must_init_aux_surface = false;
1024 }
1025
1026 } else if (private_binding_acquire) {
1027 /* The fast clear state lives in a driver-private bo, and therefore the
1028 * external/foreign queue is unaware of it.
1029 *
1030 * If this is the first time we are accessing the image, then the fast
1031 * clear state is uninitialized.
1032 *
1033 * If this is NOT the first time we are accessing the image, then the fast
1034 * clear state may still be valid and correct due to the resolve during
1035 * our most recent ownership release. However, we do not track the aux
1036 * state with MI stores, and therefore must assume the worst-case: that
1037 * this is the first time we are accessing the image.
1038 */
1039 assert(image->planes[plane].fast_clear_memory_range.binding ==
1040 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
1041 must_init_fast_clear_state = true;
1042
1043 if (anv_image_get_aux_memory_range(image, plane)->binding ==
1044 ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
1045 /* The aux surface, like the fast clear state, lives in
1046 * a driver-private bo. We must initialize the aux surface for the
1047 * same reasons we must initialize the fast clear state.
1048 */
1049 must_init_aux_surface = true;
1050 } else {
1051 /* The aux surface, unlike the fast clear state, lives in
1052 * application-visible VkDeviceMemory and is shared with the
1053 * external/foreign queue. Therefore, when we acquire ownership of the
1054 * image with a defined VkImageLayout, the aux surface is valid and has
1055 * the aux state required by the modifier.
1056 */
1057 must_init_aux_surface = false;
1058 }
1059 }
1060
1061 if (must_init_fast_clear_state) {
1062 if (base_level == 0 && base_layer == 0) {
1063 set_image_fast_clear_state(cmd_buffer, image, aspect,
1064 ANV_FAST_CLEAR_NONE);
1065 }
1066 init_fast_clear_color(cmd_buffer, image, aspect);
1067 }
1068
1069 if (must_init_aux_surface) {
1070 assert(must_init_fast_clear_state);
1071
1072 /* Initialize the aux buffers to enable correct rendering. In order to
1073 * ensure that things such as storage images work correctly, aux buffers
1074 * need to be initialized to valid data.
1075 *
1076 * Having an aux buffer with invalid data is a problem for two reasons:
1077 *
1078 * 1) Having an invalid value in the buffer can confuse the hardware.
1079 * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1080 * invalid and leads to the hardware doing strange things. It
1081 * doesn't hang as far as we can tell but rendering corruption can
1082 * occur.
1083 *
1084 * 2) If this transition is into the GENERAL layout and we then use the
1085 * image as a storage image, then we must have the aux buffer in the
1086 * pass-through state so that, if we then go to texture from the
1087 * image, we get the results of our storage image writes and not the
1088 * fast clear color or other random data.
1089 *
1090 * For CCS both of the problems above are real demonstrable issues. In
1091 * that case, the only thing we can do is to perform an ambiguate to
1092 * transition the aux surface into the pass-through state.
1093 *
1094 * For MCS, (2) is never an issue because we don't support multisampled
1095 * storage images. In theory, issue (1) is a problem with MCS but we've
1096 * never seen it in the wild. For 4x and 16x, all bit patterns could,
1097 * in theory, be interpreted as something but we don't know that all bit
1098 * patterns are actually valid. For 2x and 8x, you could easily end up
1099 * with the MCS referring to an invalid plane because not all bits of
1100 * the MCS value are actually used. Even though we've never seen issues
1101 * in the wild, it's best to play it safe and initialize the MCS. We
1102 * could use a fast-clear for MCS because we only ever touch from render
1103 * and texture (no image load store). However, due to WA 14013111325,
1104 * we choose to ambiguate MCS as well.
1105 */
1106 if (image->vk.samples == 1) {
1107 for (uint32_t l = 0; l < level_count; l++) {
1108 const uint32_t level = base_level + l;
1109
1110 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1111 if (base_layer >= aux_layers)
1112 break; /* We will only get fewer layers as level increases */
1113 uint32_t level_layer_count =
1114 MIN2(layer_count, aux_layers - base_layer);
1115
1116 /* If will_full_fast_clear is set, the caller promises to
1117 * fast-clear the largest portion of the specified range as it can.
1118 * For color images, that means only the first LOD and array slice.
1119 */
1120 if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1121 base_layer++;
1122 level_layer_count--;
1123 if (level_layer_count == 0)
1124 continue;
1125 }
1126
1127 anv_image_ccs_op(cmd_buffer, image,
1128 image->planes[plane].primary_surface.isl.format,
1129 ISL_SWIZZLE_IDENTITY,
1130 aspect, level, base_layer, level_layer_count,
1131 ISL_AUX_OP_AMBIGUATE, NULL, false);
1132
1133 set_image_compressed_bit(cmd_buffer, image, aspect, level,
1134 base_layer, level_layer_count, false);
1135 }
1136 } else {
1137 /* If will_full_fast_clear is set, the caller promises to fast-clear
1138 * the largest portion of the specified range as it can.
1139 */
1140 if (will_full_fast_clear)
1141 return;
1148
1149 assert(base_level == 0 && level_count == 1);
1150 anv_image_mcs_op(cmd_buffer, image,
1151 image->planes[plane].primary_surface.isl.format,
1152 ISL_SWIZZLE_IDENTITY,
1153 aspect, base_layer, layer_count,
1154 ISL_AUX_OP_AMBIGUATE, NULL, false);
1155 }
1156 return;
1157 }
1158
1159 /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1160 * We can handle transitions between CCS_D/E to and from NONE. What we
1161 * don't yet handle is switching between CCS_E and CCS_D within a given
1162 * image. Doing so in a performant way requires more detailed aux state
1163 * tracking such as what is done in i965. For now, just assume that we
1164 * only have one type of compression.
1165 */
1166 assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1167 final_aux_usage == ISL_AUX_USAGE_NONE ||
1168 initial_aux_usage == final_aux_usage);
1169
1170 /* If initial aux usage is NONE, there is nothing to resolve */
1171 if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1172 return;
1173
1174 enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1175
1176 /* If the initial layout supports more fast clear than the final layout
1177 * then we need at least a partial resolve.
1178 */
1179 if (final_fast_clear < initial_fast_clear) {
1180 /* Partial resolves will actually only occur on layer 0/level 0. This
1181 * is generally okay because anv only allows explicit fast clears to
1182 * the first subresource.
1183 *
1184 * The situation is a bit different with FCV_CCS_E. With that aux
1185 * usage, implicit fast clears can occur on any layer and level.
1186 * anv doesn't track fast clear states for more than the first
1187 * subresource, so we need to assert that a layout transition doesn't
1188 * attempt to partial resolve the other subresources.
1189 *
1190 * At the moment, we don't enter such a situation, and partial resolves
1191 * for higher level/layer resources shouldn't be a concern.
1192 */
1193 if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
1194 assert(base_level == 0 && level_count == 1 &&
1195 base_layer == 0 && layer_count == 1);
1196 }
1197 resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1198 }
1199
1200 if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
1201 !isl_aux_usage_has_ccs_e(final_aux_usage))
1202 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
1203
1204 if (resolve_op == ISL_AUX_OP_NONE)
1205 return;
1206
1207 /* Perform a resolve to synchronize data between the main and aux buffer.
1208 * Before we begin, we must satisfy the cache flushing requirement specified
1209 * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1210 *
1211 * Any transition from any value in {Clear, Render, Resolve} to a
1212 * different value in {Clear, Render, Resolve} requires end of pipe
1213 * synchronization.
1214 *
1215 * We perform a flush of the write cache before and after the clear and
1216 * resolve operations to meet this requirement.
1217 *
1218 * Unlike other drawing, fast clear operations are not properly
1219 * synchronized. The first PIPE_CONTROL here likely ensures that the
1220 * contents of the previous render or clear hit the render target before we
1221 * resolve and the second likely ensures that the resolve is complete before
1222 * we do any more rendering or clearing.
1223 */
1224 anv_add_pending_pipe_bits(cmd_buffer,
1225 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1226 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1227 "before transition RT");
1228
1229 for (uint32_t l = 0; l < level_count; l++) {
1230 uint32_t level = base_level + l;
1231
1232 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1233 if (base_layer >= aux_layers)
1234 break; /* We will only get fewer layers as level increases */
1235 uint32_t level_layer_count =
1236 MIN2(layer_count, aux_layers - base_layer);
1237
1238 for (uint32_t a = 0; a < level_layer_count; a++) {
1239 uint32_t array_layer = base_layer + a;
1240
1241 /* If will_full_fast_clear is set, the caller promises to fast-clear
1242 * the largest portion of the specified range as it can. For color
1243 * images, that means only the first LOD and array slice.
1244 */
1245 if (level == 0 && array_layer == 0 && will_full_fast_clear)
1246 continue;
1247
1248 if (image->vk.samples == 1) {
1249 anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1250 image->planes[plane].primary_surface.isl.format,
1251 ISL_SWIZZLE_IDENTITY,
1252 aspect, level, array_layer, resolve_op,
1253 final_fast_clear);
1254 } else {
1255 /* We only support fast-clear on the first layer so partial
1256 * resolves should not be used on other layers as they will use
1257 * the clear color stored in memory that is only valid for layer0.
1258 */
1259 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1260 array_layer != 0)
1261 continue;
1262
1263 anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1264 image->planes[plane].primary_surface.isl.format,
1265 ISL_SWIZZLE_IDENTITY,
1266 aspect, array_layer, resolve_op,
1267 final_fast_clear);
1268 }
1269 }
1270 }
1271
1272 anv_add_pending_pipe_bits(cmd_buffer,
1273 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1274 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1275 "after transition RT");
1276 }
1277
1278 static MUST_CHECK VkResult
1279 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1280 uint32_t color_att_count)
1281 {
1282 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1283
1284 /* Reserve one for the NULL state. */
1285 unsigned num_states = 1 + color_att_count;
1286 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1287 const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
1288 gfx->att_states =
1289 anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
1290 if (gfx->att_states.map == NULL)
1291 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1292
1293 struct anv_state next_state = gfx->att_states;
1294 next_state.alloc_size = isl_dev->ss.size;
1295
1296 gfx->null_surface_state = next_state;
1297 next_state.offset += ss_stride;
1298 next_state.map += ss_stride;
1299
1300 gfx->color_att_count = color_att_count;
1301 for (uint32_t i = 0; i < color_att_count; i++) {
1302 gfx->color_att[i] = (struct anv_attachment) {
1303 .surface_state.state = next_state,
1304 };
1305 next_state.offset += ss_stride;
1306 next_state.map += ss_stride;
1307 }
1308 gfx->depth_att = (struct anv_attachment) { };
1309 gfx->stencil_att = (struct anv_attachment) { };
1310
1311 return VK_SUCCESS;
1312 }
1313
1314 static void
1315 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1316 {
1317 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1318
1319 gfx->render_area = (VkRect2D) { };
1320 gfx->layer_count = 0;
1321 gfx->samples = 0;
1322
1323 gfx->color_att_count = 0;
1324 gfx->depth_att = (struct anv_attachment) { };
1325 gfx->stencil_att = (struct anv_attachment) { };
1326 gfx->null_surface_state = ANV_STATE_NULL;
1327 }
1328
1329 /**
1330 * Program the hardware to use the specified L3 configuration.
1331 */
1332 void
1333 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1334 const struct intel_l3_config *cfg)
1335 {
1336 assert(cfg || GFX_VER >= 12);
1337 if (cfg == cmd_buffer->state.current_l3_config)
1338 return;
1339
1340 #if GFX_VER >= 11
1341 /* On Gfx11+ we use only one config, so verify it remains the same and skip
1342 * the stalling programming entirely.
1343 */
1344 assert(cfg == cmd_buffer->device->l3_config);
1345 #else
1346 if (INTEL_DEBUG(DEBUG_L3)) {
1347 mesa_logd("L3 config transition: ");
1348 intel_dump_l3_config(cfg, stderr);
1349 }
1350
1351 /* According to the hardware docs, the L3 partitioning can only be changed
1352 * while the pipeline is completely drained and the caches are flushed,
1353 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1354 */
1355 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1356 cmd_buffer->state.current_pipeline,
1357 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1358 ANV_PIPE_CS_STALL_BIT);
1359
1360 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1361 * invalidation of the relevant caches. Note that because RO invalidation
1362 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1363 * command is processed by the CS) we cannot combine it with the previous
1364 * stalling flush as the hardware documentation suggests, because that
1365 * would cause the CS to stall on previous rendering *after* RO
1366 * invalidation and wouldn't prevent the RO caches from being polluted by
1367 * concurrent rendering before the stall completes. This intentionally
1368 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1369 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1370 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1371 * already guarantee that there is no concurrent GPGPU kernel execution
1372 * (see SKL HSD 2132585).
1373 */
1374 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1375 cmd_buffer->state.current_pipeline,
1376 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
1377 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
1378 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
1379 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
1380
1381 /* Now send a third stalling flush to make sure that invalidation is
1382 * complete when the L3 configuration registers are modified.
1383 */
1384 genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
1385 cmd_buffer->state.current_pipeline,
1386 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1387 ANV_PIPE_CS_STALL_BIT);
1388
1389 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1390 #endif /* GFX_VER >= 11 */
1391 cmd_buffer->state.current_l3_config = cfg;
1392 }
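
/* Usage sketch (illustrative; the exact call sites live outside this section):
 * callers hand in whichever L3 configuration the upcoming work requires,
 * typically when binding a pipeline. The 'l3_config' variable below is a
 * placeholder for that value (device->l3_config on Gfx11+).
 *
 *    genX(cmd_buffer_config_l3)(cmd_buffer, l3_config);
 *
 * The function returns immediately when the requested config already matches
 * cmd_buffer->state.current_l3_config, so calling it redundantly is cheap.
 */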
1393
1394 ALWAYS_INLINE void
1395 genX(invalidate_aux_map)(struct anv_batch *batch,
1396 struct anv_device *device,
1397 enum intel_engine_class engine_class,
1398 enum anv_pipe_bits bits)
1399 {
1400 #if GFX_VER == 12
1401 if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
1402 uint32_t register_addr = 0;
1403 switch (engine_class) {
1404 case INTEL_ENGINE_CLASS_COMPUTE:
1405 register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
1406 break;
1407 case INTEL_ENGINE_CLASS_COPY:
1408 #if GFX_VERx10 >= 125
1409 register_addr = GENX(BCS_CCS_AUX_INV_num);
1410 #endif
1411 break;
1412 case INTEL_ENGINE_CLASS_VIDEO:
1413 register_addr = GENX(VD0_CCS_AUX_INV_num);
1414 break;
1415 case INTEL_ENGINE_CLASS_RENDER:
1416 default:
1417 register_addr = GENX(GFX_CCS_AUX_INV_num);
1418 break;
1419 }
1420
1421 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
1422 lri.RegisterOffset = register_addr;
1423 lri.DataDWord = 1;
1424 }
1425
1426 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
1427 if (intel_needs_workaround(device->info, 16018063123) &&
1428 engine_class == INTEL_ENGINE_CLASS_COPY) {
1429 genX(batch_emit_fast_color_dummy_blit)(batch, device);
1430 }
1431
1432 /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1433 *
1434 * "Poll Aux Invalidation bit once the invalidation is set
1435 * (Register 4208 bit 0)"
1436 */
1437 anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
1438 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
1439 sem.WaitMode = PollingMode;
1440 sem.RegisterPollMode = true;
1441 sem.SemaphoreDataDword = 0x0;
1442 sem.SemaphoreAddress =
1443 anv_address_from_u64(register_addr);
1444 }
1445 }
1446 #else
1447 assert(!device->info->has_aux_map);
1448 #endif
1449 }
1450
1451 ALWAYS_INLINE enum anv_pipe_bits
1452 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1453 struct anv_device *device,
1454 uint32_t current_pipeline,
1455 enum anv_pipe_bits bits,
1456 enum anv_pipe_bits *emitted_flush_bits)
1457 {
1458 #if GFX_VER >= 12
1459 /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
1460 *
1461 * "SW must follow below programming restrictions when programming
1462 * PIPE_CONTROL command [for ComputeCS]:
1463 * ...
1464 * Following bits must not be set when programmed for ComputeCS:
1465 * - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
1466 * and "Tile Cache Flush Enable"
1467 * - "Depth Stall Enable", Stall at Pixel Scoreboard and
1468 * "PSD Sync Enable".
1469 * - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
1470 * "AMFS Flush Enable", "VF Cache Invalidation Enable" and
1471 * "Global Snapshot Count Reset"."
1472 *
1473 * XXX: According to spec this should not be a concern for a regular
1474 * RCS in GPGPU mode, but during testing it was found that at least
1475 * "VF Cache Invalidation Enable" bit is ignored in such case.
1476 * This can cause us to miss some important invalidations
1477 * (e.g. from CmdPipelineBarriers) and have incoherent data.
1478 *
1479 * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
1480 * when specific 3d related bits are programmed in pipecontrol in
1481 * compute mode" which suggests not using "RT Cache Flush" in GPGPU mode.
1482 *
1483 * The other bits are not confirmed to cause problems, but included here
1484 * just to be safe, as they're also not really relevant in the GPGPU mode,
1485 * and having them doesn't seem to cause any regressions.
1486 *
1487 * So if we're currently in GPGPU mode, we hide some bits from
1488 * this flush and only flush them once we are back in a mode where
1489 * they are allowed. The same applies to GPGPU-only bits.
1490 */
1491 enum anv_pipe_bits defer_bits = bits &
1492 (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS: ANV_PIPE_GPGPU_BITS);
1493
1494 bits &= ~defer_bits;
1495 #endif
1496
1497 /*
1498 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1499 *
1500 * Write synchronization is a special case of end-of-pipe
1501 * synchronization that requires that the render cache and/or depth
1502 * related caches are flushed to memory, where the data will become
1503 * globally visible. This type of synchronization is required prior to
1504 * SW (CPU) actually reading the result data from memory, or initiating
1505 * an operation that will use as a read surface (such as a texture
1506 * surface) a previous render target and/or depth/stencil buffer
1507 *
1508 *
1509 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1510 *
1511 * Exercising the write cache flush bits (Render Target Cache Flush
1512 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1513 * ensures the write caches are flushed and doesn't guarantee the data
1514 * is globally visible.
1515 *
1516 * SW can track the completion of the end-of-pipe-synchronization by
1517 * using "Notify Enable" and "PostSync Operation - Write Immediate
1518 * Data" in the PIPE_CONTROL command.
1519 *
1520 * In other words, flushes are pipelined while invalidations are handled
1521 * immediately. Therefore, if we're flushing anything then we need to
1522 * schedule an end-of-pipe sync before any invalidations can happen.
1523 */
1524 if (bits & ANV_PIPE_FLUSH_BITS)
1525 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
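   /* Worked example (illustrative): a barrier requesting
    * ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT together with
    * ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT picks up
    * ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT here. Because an invalidate bit is
    * also pending, the code below converts that into
    * ANV_PIPE_END_OF_PIPE_SYNC_BIT, so the batch ends up with two
    * PIPE_CONTROLs: a stalling flush with a post-sync write, followed by a
    * second one carrying the texture cache invalidation.
    */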
1526
1527
1528 /* HSD 1209978178: docs say that before programming the aux table:
1529 *
1530 * "Driver must ensure that the engine is IDLE but ensure it doesn't
1531 * add extra flushes in the case it knows that the engine is already
1532 * IDLE."
1533 *
1534 * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
1535 *
1536 * "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
1537 *
1538 * Notice we don't set the L3 Fabric Flush here, because we have
1539 * ANV_PIPE_END_OF_PIPE_SYNC_BIT which inserts a CS stall. The
1540 * PIPE_CONTROL::L3 Fabric Flush documentation says :
1541 *
1542 * "L3 Fabric Flush will ensure all the pending transactions in the L3
1543 * Fabric are flushed to global observation point. HW does implicit L3
1544 * Fabric Flush on all stalling flushes (both explicit and implicit)
1545 * and on PIPECONTROL having Post Sync Operation enabled."
1546 *
1547 * Therefore setting L3 Fabric Flush here would be redundant.
1548 */
1549 if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
1550 if (current_pipeline == GPGPU) {
1551 bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1552 ANV_PIPE_DATA_CACHE_FLUSH_BIT |
1553 (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1554 } else if (current_pipeline == _3D) {
1555 bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
1556 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1557 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
1558 (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT: 0));
1559 }
1560 }
1561
1562 /* If we're going to do an invalidate and we have a pending end-of-pipe
1563 * sync that has yet to be resolved, we do the end-of-pipe sync now.
1564 */
1565 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1566 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1567 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1568 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1569
1570 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
1571 fputs("pc: add ", stderr);
1572 anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stderr);
1573 fprintf(stderr, "reason: Ensure flushes done before invalidate\n");
1574 }
1575 }
1576
1577 /* Project: SKL / Argument: LRI Post Sync Operation [23]
1578 *
1579 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1580 * programmed prior to programming a PIPECONTROL command with "LRI
1581 * Post Sync Operation" in GPGPU mode of operation (i.e when
1582 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
1583 *
1584 * The same text exists a few rows below for Post Sync Op.
1585 */
1586 if (bits & ANV_PIPE_POST_SYNC_BIT) {
1587 if (GFX_VER == 9 && current_pipeline == GPGPU)
1588 bits |= ANV_PIPE_CS_STALL_BIT;
1589 bits &= ~ANV_PIPE_POST_SYNC_BIT;
1590 }
1591
1592 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1593 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1594 enum anv_pipe_bits flush_bits =
1595 bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1596 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1597
1598 #if GFX_VERx10 >= 125
1599 if (current_pipeline != GPGPU) {
1600 if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
1601 flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
1602 } else {
1603 if (flush_bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
1604 ANV_PIPE_DATA_CACHE_FLUSH_BIT))
1605 flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
1606 }
1607
1608 /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
1609 *
1610 * "'HDC Pipeline Flush' bit must be set for this bit to take
1611 * effect."
1612 */
1613 if (flush_bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
1614 flush_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
1615 #endif
1616
1617 #if GFX_VER < 12
1618 if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
1619 flush_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
1620 #endif
1621
1622 uint32_t sync_op = NoWrite;
1623 struct anv_address addr = ANV_NULL_ADDRESS;
1624
1625 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1626 *
1627 * "The most common action to perform upon reaching a
1628 * synchronization point is to write a value out to memory. An
1629 * immediate value (included with the synchronization command) may
1630 * be written."
1631 *
1632 *
1633 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1634 *
1635 * "In case the data flushed out by the render engine is to be
1636 * read back in to the render engine in coherent manner, then the
1637 * render engine has to wait for the fence completion before
1638 * accessing the flushed data. This can be achieved by following
1639 * means on various products: PIPE_CONTROL command with CS Stall
1640 * and the required write caches flushed with Post-Sync-Operation
1641 * as Write Immediate Data.
1642 *
1643 * Example:
1644 * - Workload-1 (3D/GPGPU/MEDIA)
1645 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1646 * Immediate Data, Required Write Cache Flush bits set)
1647 * - Workload-2 (Can use the data produce or output by
1648 * Workload-1)
1649 */
1650 if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1651 flush_bits |= ANV_PIPE_CS_STALL_BIT;
1652 sync_op = WriteImmediateData;
1653 addr = device->workaround_address;
1654 }
1655
1656 /* Flush PC. */
1657 genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1658 sync_op, addr, 0, flush_bits);
1659
1660 /* If the caller wants to know what flushes have been emitted,
1661 * provide the bits based off the PIPE_CONTROL programmed bits.
1662 */
1663 if (emitted_flush_bits != NULL)
1664 *emitted_flush_bits = flush_bits;
1665
1666 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1667 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1668 }
1669
1670 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1671 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1672 *
1673 * "If the VF Cache Invalidation Enable is set to a 1 in a
1674 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to
1675 * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent
1676 * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to
1677 * a 1."
1678 *
1679 * This appears to hang Broadwell, so we restrict it to just gfx9.
1680 */
1681 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
1682 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
1683
1684 #if GFX_VER >= 9 && GFX_VER <= 11
1685 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1686 *
1687 * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
1688 * always set for GPGPU workloads when “Texture Cache
1689 * Invalidation Enable” bit is set".
1690 *
1691 * Workaround stopped appearing in TGL PRMs.
1692 */
1693 if (current_pipeline == GPGPU &&
1694 (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
1695 bits |= ANV_PIPE_CS_STALL_BIT;
1696 #endif
1697
1698 uint32_t sync_op = NoWrite;
1699 struct anv_address addr = ANV_NULL_ADDRESS;
1700
1701 /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
1702 *
1703 * "When VF Cache Invalidate is set “Post Sync Operation” must be
1704 * enabled to “Write Immediate Data” or “Write PS Depth Count” or
1705 * “Write Timestamp”.
1706 */
1707 if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1708 sync_op = WriteImmediateData;
1709 addr = device->workaround_address;
1710 }
1711
1712 /* Invalidate PC. */
1713 genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
1714 sync_op, addr, 0, bits);
1715
1716 enum intel_engine_class engine_class =
1717 current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
1718 INTEL_ENGINE_CLASS_RENDER;
1719 genX(invalidate_aux_map)(batch, device, engine_class, bits);
1720
1721 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1722 }
1723
1724 #if GFX_VER >= 12
1725 bits |= defer_bits;
1726 #endif
1727
1728 return bits;
1729 }
1730
1731 ALWAYS_INLINE void
1732 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1733 {
1734 #if INTEL_NEEDS_WA_1508744258
1735 /* If we're changing the state of the RHWO optimization, we need to have
1736 * sb_stall+cs_stall.
1737 */
1738 const bool rhwo_opt_change =
1739 cmd_buffer->state.rhwo_optimization_enabled !=
1740 cmd_buffer->state.pending_rhwo_optimization_enabled;
1741 if (rhwo_opt_change) {
1742 anv_add_pending_pipe_bits(cmd_buffer,
1743 ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
1744 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1745 "change RHWO optimization");
1746 }
1747 #endif
1748
1749 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1750
1751 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1752 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1753 else if (bits == 0)
1754 return;
1755
1756 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
1757 anv_cmd_buffer_is_video_queue(cmd_buffer)) {
1758 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1759 genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
1760 cmd_buffer->queue_family->engine_class, bits);
1761 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1762 }
1763 cmd_buffer->state.pending_pipe_bits = bits;
1764 return;
1765 }
1766
1767 const bool trace_flush =
1768 (bits & (ANV_PIPE_FLUSH_BITS |
1769 ANV_PIPE_STALL_BITS |
1770 ANV_PIPE_INVALIDATE_BITS |
1771 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
1772 if (trace_flush)
1773 trace_intel_begin_stall(&cmd_buffer->trace);
1774
1775 if (GFX_VER == 9 &&
1776 (bits & ANV_PIPE_CS_STALL_BIT) &&
1777 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1778 /* If we are doing a VF cache invalidate AND a CS stall (it must be
1779 * both) then we can reset our vertex cache tracking.
1780 */
1781 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1782 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1783 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1784 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1785 }
1786
1787
1788 enum anv_pipe_bits emitted_bits = 0;
1789 cmd_buffer->state.pending_pipe_bits =
1790 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1791 cmd_buffer->device,
1792 cmd_buffer->state.current_pipeline,
1793 bits,
1794 &emitted_bits);
1795 anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
1796
1797 #if INTEL_NEEDS_WA_1508744258
1798 if (rhwo_opt_change) {
1799 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1800 c1.RCCRHWOOptimizationDisable =
1801 !cmd_buffer->state.pending_rhwo_optimization_enabled;
1802 c1.RCCRHWOOptimizationDisableMask = true;
1803 }
1804 cmd_buffer->state.rhwo_optimization_enabled =
1805 cmd_buffer->state.pending_rhwo_optimization_enabled;
1806 }
1807 #endif
1808
1809 if (trace_flush) {
1810 trace_intel_end_stall(&cmd_buffer->trace,
1811 bits & ~cmd_buffer->state.pending_pipe_bits,
1812 anv_pipe_flush_bit_to_ds_stall_flag, NULL);
1813 }
1814 }
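
/* Usage sketch (hedged; this mirrors how the rest of this file uses the API):
 * callers accumulate bits with a reason string and let the actual
 * PIPE_CONTROLs be emitted lazily at the next flush point.
 *
 *    anv_add_pending_pipe_bits(cmd_buffer,
 *                              ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
 *                              "example: sampled image barrier");
 *    ...
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 *
 * Bits that cannot be emitted on the current pipeline (see the GPGPU vs. 3D
 * deferral in genX(emit_apply_pipe_flushes)) stay in
 * cmd_buffer->state.pending_pipe_bits for a later call.
 */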
1815
1816 static inline struct anv_state
1817 emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1818 struct anv_cmd_pipeline_state *pipe_state,
1819 struct anv_pipeline_binding *binding,
1820 const struct anv_descriptor *desc)
1821 {
1822 if (!desc->buffer)
1823 return anv_null_surface_state_for_binding_table(cmd_buffer->device);
1824
1825 /* Compute the offset within the buffer */
1826 uint32_t dynamic_offset =
1827 pipe_state->dynamic_offsets[
1828 binding->set].offsets[binding->dynamic_offset_index];
1829 uint64_t offset = desc->offset + dynamic_offset;
1830 /* Clamp to the buffer size */
1831 offset = MIN2(offset, desc->buffer->vk.size);
1832 /* Clamp the range to the buffer size */
1833 uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
1834
1835 /* Align the range for consistency */
1836 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
1837 range = align(range, ANV_UBO_ALIGNMENT);
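   /* Worked example (illustrative numbers, not taken from the source): with
    * desc->offset = 64, a dynamic offset of 128 and vk.size = 256, the surface
    * starts at byte 192 of the buffer; a requested range of 128 is clamped to
    * 256 - 192 = 64 bytes and, for dynamic uniform buffers, rounded up to
    * ANV_UBO_ALIGNMENT before the surface state is filled in below.
    */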
1838
1839 struct anv_address address =
1840 anv_address_add(desc->buffer->address, offset);
1841
1842 struct anv_state surface_state =
1843 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
1844 if (surface_state.map == NULL)
1845 return ANV_STATE_NULL;
1846
1847 enum isl_format format =
1848 anv_isl_format_for_descriptor_type(cmd_buffer->device,
1849 desc->type);
1850
1851 isl_surf_usage_flags_t usage =
1852 anv_isl_usage_for_descriptor_type(desc->type);
1853
1854 anv_fill_buffer_surface_state(cmd_buffer->device,
1855 surface_state.map,
1856 format, ISL_SWIZZLE_IDENTITY,
1857 usage, address, range, 1);
1858
1859 return surface_state;
1860 }
1861
1862 static uint32_t
1863 emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1864 struct anv_cmd_pipeline_state *pipe_state,
1865 struct anv_pipeline_binding *binding,
1866 const struct anv_descriptor *desc)
1867 {
1868 struct anv_device *device = cmd_buffer->device;
1869 struct anv_state surface_state;
1870
1871 /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1872 * Depending on where the descriptor surface state is allocated, they can
1873 * either come from device->internal_surface_state_pool or
1874 * device->bindless_surface_state_pool.
1875 */
1876 switch (desc->type) {
1877 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1878 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1879 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
1880 if (desc->image_view) {
1881 const struct anv_surface_state *sstate =
1882 anv_image_view_texture_surface_state(desc->image_view,
1883 binding->plane,
1884 desc->layout);
1885 surface_state = desc->image_view->use_surface_state_stream ?
1886 sstate->state :
1887 anv_bindless_state_for_binding_table(device, sstate->state);
1888 assert(surface_state.alloc_size);
1889 } else {
1890 surface_state = anv_null_surface_state_for_binding_table(device);
1891 }
1892 break;
1893 }
1894
1895 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1896 if (desc->image_view) {
1897 const struct anv_surface_state *sstate =
1898 anv_image_view_storage_surface_state(desc->image_view);
1899 surface_state = desc->image_view->use_surface_state_stream ?
1900 sstate->state :
1901 anv_bindless_state_for_binding_table(device, sstate->state);
1902 assert(surface_state.alloc_size);
1903 } else {
1904 surface_state =
1905 anv_null_surface_state_for_binding_table(device);
1906 }
1907 break;
1908 }
1909
1910 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1911 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1912 if (desc->set_buffer_view) {
1913 surface_state = desc->set_buffer_view->general.state;
1914 assert(surface_state.alloc_size);
1915 } else {
1916 surface_state = anv_null_surface_state_for_binding_table(device);
1917 }
1918 break;
1919
1920 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1921 if (desc->buffer_view) {
1922 surface_state = anv_bindless_state_for_binding_table(
1923 device,
1924 desc->buffer_view->general.state);
1925 assert(surface_state.alloc_size);
1926 } else {
1927 surface_state = anv_null_surface_state_for_binding_table(device);
1928 }
1929 break;
1930
1931 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
1932 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
1933 surface_state =
1934 emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
1935 binding, desc);
1936 break;
1937
1938 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
1939 if (desc->buffer_view) {
1940 surface_state = anv_bindless_state_for_binding_table(
1941 device, desc->buffer_view->storage.state);
1942 assert(surface_state.alloc_size);
1943 } else {
1944 surface_state = anv_null_surface_state_for_binding_table(device);
1945 }
1946 break;
1947
1948 default:
1949 unreachable("Invalid descriptor type");
1950 }
1951
1952 return surface_state.offset;
1953 }
1954
1955 static uint32_t
1956 emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
1957 struct anv_cmd_pipeline_state *pipe_state,
1958 const struct anv_descriptor_set *set,
1959 struct anv_pipeline_binding *binding,
1960 const struct anv_descriptor *desc)
1961 {
1962 uint32_t desc_offset;
1963
1964 /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
1965 * Depending on where the descriptor surface state is allocated, they can
1966 * either come from device->internal_surface_state_pool or
1967 * device->bindless_surface_state_pool.
1968 */
1969 switch (desc->type) {
1970 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1971 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1972 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
1973 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
1974 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1975 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1976 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1977 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
1978 desc_offset = set->desc_offset + binding->set_offset;
1979 break;
1980
1981 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
1982 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
1983 struct anv_state state =
1984 emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
1985 binding, desc);
1986 desc_offset = state.offset;
1987 break;
1988 }
1989
1990 default:
1991 unreachable("Invalid descriptor type");
1992 }
1993
1994 return desc_offset;
1995 }
1996
1997 static VkResult
1998 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
1999 struct anv_cmd_pipeline_state *pipe_state,
2000 struct anv_shader_bin *shader,
2001 struct anv_state *bt_state)
2002 {
2003 uint32_t state_offset;
2004
2005 struct anv_pipeline_bind_map *map = &shader->bind_map;
2006 if (map->surface_count == 0) {
2007 *bt_state = (struct anv_state) { 0, };
2008 return VK_SUCCESS;
2009 }
2010
2011 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2012 map->surface_count,
2013 &state_offset);
2014 uint32_t *bt_map = bt_state->map;
2015
2016 if (bt_state->map == NULL)
2017 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2018
2019 for (uint32_t s = 0; s < map->surface_count; s++) {
2020 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2021
2022 struct anv_state surface_state;
2023
2024 switch (binding->set) {
2025 case ANV_DESCRIPTOR_SET_NULL:
2026 bt_map[s] = 0;
2027 break;
2028
2029 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2030 /* Color attachment binding */
2031 assert(shader->stage == MESA_SHADER_FRAGMENT);
2032 if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2033 const struct anv_attachment *att =
2034 &cmd_buffer->state.gfx.color_att[binding->index];
2035 surface_state = att->surface_state.state;
2036 } else {
2037 surface_state = cmd_buffer->state.gfx.null_surface_state;
2038 }
2039 assert(surface_state.map);
2040 bt_map[s] = surface_state.offset + state_offset;
2041 break;
2042
2043 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2044 /* This is always the first binding for compute shaders */
2045 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2046
2047 struct anv_state surface_state =
2048 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2049 if (surface_state.map == NULL)
2050 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2051
2052 const enum isl_format format =
2053 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2054 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2055 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
2056 format, ISL_SWIZZLE_IDENTITY,
2057 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2058 cmd_buffer->state.compute.num_workgroups,
2059 12, 1);
2060
2061 assert(surface_state.map);
2062 bt_map[s] = surface_state.offset + state_offset;
2063 break;
2064 }
2065
2066 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2067 struct anv_descriptor_set *set =
2068 pipe_state->descriptors[binding->index];
2069
2070 /* If the shader doesn't access the set buffer, just put the null
2071 * surface.
2072 */
2073 if (set->is_push && !shader->push_desc_info.used_set_buffer) {
2074 bt_map[s] = 0;
2075 break;
2076 }
2077
2078 /* This is a descriptor set buffer so the set index is actually
2079 * given by binding->binding. (Yes, that's confusing.)
2080 */
2081 assert(set->desc_surface_mem.alloc_size);
2082 assert(set->desc_surface_state.alloc_size);
2083 bt_map[s] = set->desc_surface_state.offset + state_offset;
2084 add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
2085 break;
2086 }
2087
2088 default: {
2089 assert(binding->set < MAX_SETS);
2090 const struct anv_descriptor_set *set =
2091 pipe_state->descriptors[binding->set];
2092
2093 if (binding->index >= set->descriptor_count) {
2094 /* From the Vulkan spec section entitled "DescriptorSet and
2095 * Binding Assignment":
2096 *
2097 * "If the array is runtime-sized, then array elements greater
2098 * than or equal to the size of that binding in the bound
2099 * descriptor set must not be used."
2100 *
2101 * Unfortunately, the compiler isn't smart enough to figure out
2102 * when a dynamic binding isn't used so it may grab the whole
2103 * array and stick it in the binding table. In this case, it's
2104 * safe to just skip those bindings that are OOB.
2105 */
2106 assert(binding->index < set->layout->descriptor_count);
2107 continue;
2108 }
2109
2110 /* For push descriptor, if the binding is fully promoted to push
2111 * constants, just reference the null surface in the binding table.
2112 * It's unused and we didn't allocate/pack a surface state for it.
2113 */
2114 if (set->is_push) {
2115 uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
2116 assert(desc_idx < MAX_PUSH_DESCRIPTORS);
2117
2118 if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
2119 surface_state =
2120 anv_null_surface_state_for_binding_table(cmd_buffer->device);
bt_map[s] = surface_state.offset + state_offset;
2121 break;
2122 }
2123 }
2124
2125 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2126 if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
2127 desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
2128 /* Nothing for us to do here */
2129 continue;
2130 }
2131
2132 const struct anv_pipeline *pipeline = pipe_state->pipeline;
2133 uint32_t surface_state_offset;
2134 if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
2135 surface_state_offset =
2136 emit_indirect_descriptor_binding_table_entry(cmd_buffer,
2137 pipe_state,
2138 binding, desc);
2139 } else {
2140 surface_state_offset =
2141 emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
2142 set, binding, desc);
2143 }
2144
2145 bt_map[s] = surface_state_offset + state_offset;
2146 break;
2147 }
2148 }
2149 }
2150
2151 return VK_SUCCESS;
2152 }
2153
2154 static VkResult
2155 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2156 struct anv_cmd_pipeline_state *pipe_state,
2157 struct anv_shader_bin *shader,
2158 struct anv_state *state)
2159 {
2160 struct anv_pipeline_bind_map *map = &shader->bind_map;
2161 if (map->sampler_count == 0) {
2162 *state = (struct anv_state) { 0, };
2163 return VK_SUCCESS;
2164 }
2165
2166 uint32_t size = map->sampler_count * 16;
2167 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
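   /* Sizing note: each entry copied below is 16 bytes (one GENX(SAMPLER_STATE),
    * i.e. 4 dwords), so a shader using e.g. 3 samplers gets a 48-byte table,
    * allocated with the 32-byte alignment requested above.
    */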
2168
2169 if (state->map == NULL)
2170 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2171
2172 for (uint32_t s = 0; s < map->sampler_count; s++) {
2173 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2174 const struct anv_descriptor *desc =
2175 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2176
2177 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2178 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2179 continue;
2180
2181 struct anv_sampler *sampler = desc->sampler;
2182
2183 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2184 * happens to be zero.
2185 */
2186 if (sampler == NULL)
2187 continue;
2188
2189 memcpy(state->map + (s * 16),
2190 sampler->state[binding->plane], sizeof(sampler->state[0]));
2191 }
2192
2193 return VK_SUCCESS;
2194 }
2195
2196 uint32_t
2197 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
2198 struct anv_cmd_pipeline_state *pipe_state,
2199 const VkShaderStageFlags dirty,
2200 struct anv_shader_bin **shaders,
2201 uint32_t num_shaders)
2202 {
2203 VkShaderStageFlags flushed = 0;
2204
2205 VkResult result = VK_SUCCESS;
2206 for (uint32_t i = 0; i < num_shaders; i++) {
2207 if (!shaders[i])
2208 continue;
2209
2210 gl_shader_stage stage = shaders[i]->stage;
2211 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2212 if ((vk_stage & dirty) == 0)
2213 continue;
2214
2215 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2216 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2217 &cmd_buffer->state.samplers[stage]);
2218 if (result != VK_SUCCESS)
2219 break;
2220
2221 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2222 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2223 &cmd_buffer->state.binding_tables[stage]);
2224 if (result != VK_SUCCESS)
2225 break;
2226
2227 flushed |= vk_stage;
2228 }
2229
2230 if (result != VK_SUCCESS) {
2231 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2232
2233 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2234 if (result != VK_SUCCESS)
2235 return 0;
2236
2237 /* Re-emit state base addresses so we get the new surface state base
2238 * address before we start emitting binding tables etc.
2239 */
2240 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2241
2242 /* Re-emit all active binding tables */
2243 flushed = 0;
2244
2245 for (uint32_t i = 0; i < num_shaders; i++) {
2246 if (!shaders[i])
2247 continue;
2248
2249 gl_shader_stage stage = shaders[i]->stage;
2250
2251 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2252 &cmd_buffer->state.samplers[stage]);
2253 if (result != VK_SUCCESS) {
2254 anv_batch_set_error(&cmd_buffer->batch, result);
2255 return 0;
2256 }
2257 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2258 &cmd_buffer->state.binding_tables[stage]);
2259 if (result != VK_SUCCESS) {
2260 anv_batch_set_error(&cmd_buffer->batch, result);
2261 return 0;
2262 }
2263
2264 flushed |= mesa_to_vk_shader_stage(stage);
2265 }
2266 }
2267
2268 return flushed;
2269 }
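
/* Usage sketch (an assumption about the caller, which lives outside this
 * section): the returned stage mask tells the caller which stages received
 * fresh binding tables and samplers so it can re-emit the corresponding
 * 3DSTATE_BINDING_TABLE_POINTERS_* / 3DSTATE_SAMPLER_STATE_POINTERS_* packets.
 *
 *    uint32_t flushed =
 *       genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer, pipe_state,
 *                                              VK_SHADER_STAGE_ALL_GRAPHICS,
 *                                              shaders, num_shaders);
 *    // re-emit pointer packets for each stage set in 'flushed'
 */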
2270
2271 /* This function generates the surface state used to read the content of the
2272 * descriptor buffer.
2273 */
2274 void
2275 genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
2276 struct anv_descriptor_set *set)
2277 {
2278 assert(set->desc_surface_state.map == NULL);
2279
2280 struct anv_descriptor_set_layout *layout = set->layout;
2281 enum isl_format format =
2282 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2283 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2284
2285 set->desc_surface_state =
2286 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2287 if (set->desc_surface_state.map == NULL)
2288 return;
2289 anv_fill_buffer_surface_state(cmd_buffer->device,
2290 set->desc_surface_state.map,
2291 format, ISL_SWIZZLE_IDENTITY,
2292 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2293 set->desc_surface_addr,
2294 layout->descriptor_buffer_surface_size, 1);
2295 }
2296
2297 /* This function generates surface states used by a pipeline for push
2298 * descriptors. This is delayed to the draw/dispatch time to avoid allocation
2299 * and surface state generation when a pipeline is not going to use the
2300 * binding table to access any push descriptor data.
2301 */
2302 void
2303 genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
2304 struct anv_descriptor_set *set)
2305 {
2306 while (set->generate_surface_states) {
2307 int desc_idx = u_bit_scan(&set->generate_surface_states);
2308 struct anv_descriptor *desc = &set->descriptors[desc_idx];
2309 struct anv_buffer_view *bview = desc->set_buffer_view;
2310
2311 if (bview != NULL && bview->general.state.map == NULL) {
2312 bview->general.state =
2313 anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
2314 if (bview->general.state.map == NULL)
2315 return;
2316 anv_descriptor_write_surface_state(cmd_buffer->device, desc,
2317 bview->general.state);
2318 }
2319 }
2320 }
2321
2322 ALWAYS_INLINE void
2323 genX(batch_emit_pipe_control)(struct anv_batch *batch,
2324 const struct intel_device_info *devinfo,
2325 uint32_t current_pipeline,
2326 enum anv_pipe_bits bits,
2327 const char *reason)
2328 {
2329 genX(batch_emit_pipe_control_write)(batch,
2330 devinfo,
2331 current_pipeline,
2332 NoWrite,
2333 ANV_NULL_ADDRESS,
2334 0,
2335 bits,
2336 reason);
2337 }
2338
2339 ALWAYS_INLINE void
2340 genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
2341 const struct intel_device_info *devinfo,
2342 uint32_t current_pipeline,
2343 uint32_t post_sync_op,
2344 struct anv_address address,
2345 uint32_t imm_data,
2346 enum anv_pipe_bits bits,
2347 const char *reason)
2348 {
2349 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
2350 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
2351 unreachable("Trying to emit unsupported PIPE_CONTROL command.");
2352
2353 /* XXX - insert all workarounds and GFX specific things below. */
2354
2355 /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
2356 * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
2357 * with CS_STALL Bit set (with No POST_SYNC ENABLED)
2358 */
2359 if (intel_device_info_is_adln(devinfo) &&
2360 current_pipeline == GPGPU &&
2361 post_sync_op != NoWrite) {
2362 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2363 pipe.CommandStreamerStallEnable = true;
2364 anv_debug_dump_pc(pipe, "Wa_14014966230");
2365 };
2366 }
2367
2368 #if INTEL_NEEDS_WA_1409600907
2369 /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
2370 * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
2371 */
2372 if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
2373 bits |= ANV_PIPE_DEPTH_STALL_BIT;
2374 #endif
2375
2376 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
2377 #if GFX_VERx10 >= 125
2378 pipe.UntypedDataPortCacheFlushEnable =
2379 bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
2380 pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
2381 #endif
2382 #if GFX_VER == 12
2383 pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
2384 #endif
2385 #if GFX_VER > 11
2386 pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
2387 #endif
2388 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
2389 pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
2390 pipe.RenderTargetCacheFlushEnable =
2391 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
2392
2393 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
2394
2395 #if GFX_VERx10 >= 125
2396 pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
2397 #endif
2398 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
2399 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
2400
2401 pipe.StateCacheInvalidationEnable =
2402 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
2403 pipe.ConstantCacheInvalidationEnable =
2404 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
2405 #if GFX_VER >= 12
2406 /* Invalidates the L3 cache part in which index & vertex data is loaded
2407 * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
2408 */
2409 pipe.L3ReadOnlyCacheInvalidationEnable =
2410 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2411 #endif
2412 pipe.VFCacheInvalidationEnable =
2413 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
2414 pipe.TextureCacheInvalidationEnable =
2415 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
2416 pipe.InstructionCacheInvalidateEnable =
2417 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
2418
2419 pipe.PostSyncOperation = post_sync_op;
2420 pipe.Address = address;
2421 pipe.DestinationAddressType = DAT_PPGTT;
2422 pipe.ImmediateData = imm_data;
2423
2424 anv_debug_dump_pc(pipe, reason);
2425 }
2426 }
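
/* Illustrative example (hedged; it mirrors the end-of-pipe sync path in
 * genX(emit_apply_pipe_flushes) above): a stalling flush that writes an
 * immediate value, so software can later observe that the flush reached the
 * end of the pipe, could be emitted as:
 *
 *    genX(batch_emit_pipe_control_write)(batch, device->info, current_pipeline,
 *                                        WriteImmediateData,
 *                                        device->workaround_address, 0,
 *                                        ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
 *                                        ANV_PIPE_CS_STALL_BIT,
 *                                        "example: end-of-pipe sync");
 */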
2427
2428 /* Set preemption on/off. */
2429 void
2430 genX(batch_set_preemption)(struct anv_batch *batch,
2431 const struct intel_device_info *devinfo,
2432 uint32_t current_pipeline,
2433 bool value)
2434 {
2435 #if GFX_VERx10 >= 120
2436 anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
2437 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
2438 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
2439 }
2440
2441 /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
2442 genx_batch_emit_pipe_control(batch, devinfo, current_pipeline,
2443 ANV_PIPE_CS_STALL_BIT);
2444
2445 for (unsigned i = 0; i < 250; i++)
2446 anv_batch_emit(batch, GENX(MI_NOOP), noop);
2447 #endif
2448 }
2449
2450 void
2451 genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
2452 {
2453 #if GFX_VERx10 >= 120
2454 if (cmd_buffer->state.gfx.object_preemption == value)
2455 return;
2456
2457 genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device->info,
2458 cmd_buffer->state.current_pipeline,
2459 value);
2460 cmd_buffer->state.gfx.object_preemption = value;
2461 #endif
2462 }
2463
2464 VkResult
2465 genX(BeginCommandBuffer)(
2466 VkCommandBuffer commandBuffer,
2467 const VkCommandBufferBeginInfo* pBeginInfo)
2468 {
2469 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2470 VkResult result;
2471
2472 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
2473 * command buffer's state. Otherwise, we must *reset* its state. In both
2474 * cases we reset it.
2475 *
2476 * From the Vulkan 1.0 spec:
2477 *
2478 * If a command buffer is in the executable state and the command buffer
2479 * was allocated from a command pool with the
2480 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
2481 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
2482 * as if vkResetCommandBuffer had been called with
2483 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
2484 * the command buffer in the recording state.
2485 */
2486 anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
2487 anv_cmd_buffer_reset_rendering(cmd_buffer);
2488
2489 cmd_buffer->usage_flags = pBeginInfo->flags;
2490
2491 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
2492 * primary level command buffers.
2493 *
2494 * From the Vulkan 1.0 spec:
2495 *
2496 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
2497 * secondary command buffer is considered to be entirely inside a render
2498 * pass. If this is a primary command buffer, then this bit is ignored.
2499 */
2500 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2501 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2502
2503 #if GFX_VER >= 12
2504 /* Re-enable prefetching at the beginning of secondary command buffers. We do
2505 * this so that the edited return instruction is not prefetched before the
2506 * edit has completed.
2507 */
2508 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2509 anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
2510 arb.PreParserDisableMask = true;
2511 arb.PreParserDisable = false;
2512 }
2513 }
2514 #endif
2515
2516 /* Assume the viewport has already been set in primary command buffers. */
2517 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
2518 cmd_buffer->state.gfx.viewport_set = true;
2519
2520 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
2521
2522 if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
2523 anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
2524 /* Re-emit the aux table register in every command buffer. This way we're
2525 * ensured that we have the table even if this command buffer doesn't
2526 * initialize any images.
2527 */
2528 if (cmd_buffer->device->info->has_aux_map) {
2529 anv_add_pending_pipe_bits(cmd_buffer,
2530 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2531 "new cmd buffer with aux-tt");
2532 }
2533 return VK_SUCCESS;
2534 }
2535
2536 #if GFX_VER >= 12
2537 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2538 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) {
2539 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
2540 /* Default value for single session. */
2541 appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
2542 appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
2543 }
2544 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2545 pc.CommandStreamerStallEnable = true;
2546 pc.DCFlushEnable = true;
2547 pc.RenderTargetCacheFlushEnable = true;
2548 pc.ProtectedMemoryEnable = true;
2549 }
2550 }
2551 #endif
2552
2553 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2554
2555 /* We sometimes store vertex data in the dynamic state buffer for blorp
2556 * operations and our dynamic state stream may re-use data from previous
2557 * command buffers. In order to prevent stale cache data, we flush the VF
2558 * cache. We could do this on every blorp call but that's not really
2559 * needed as all of the data will get written by the CPU prior to the GPU
2560 * executing anything. The chances are fairly high that they will use
2561 * blorp at least once per primary command buffer so it shouldn't be
2562 * wasted.
2563 *
2564 * There is also a workaround on gfx8 which requires us to invalidate the
2565 * VF cache occasionally. It's easier if we can assume we start with a
2566 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
2567 */
2568 anv_add_pending_pipe_bits(cmd_buffer,
2569 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
2570 "new cmd buffer");
2571
2572 /* Re-emit the aux table register in every command buffer. This way we're
2573 * ensured that we have the table even if this command buffer doesn't
2574 * initialize any images.
2575 */
2576 if (cmd_buffer->device->info->has_aux_map) {
2577 anv_add_pending_pipe_bits(cmd_buffer,
2578 ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
2579 "new cmd buffer with aux-tt");
2580 }
2581
2582 /* We send an "Indirect State Pointers Disable" packet at
2583 * EndCommandBuffer, so all push constant packets are ignored during a
2584 * context restore. Documentation says after that command, we need to
2585 * emit push constants again before any rendering operation. So we
2586 * flag them dirty here to make sure they get emitted.
2587 */
2588 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2589
2590 if (cmd_buffer->usage_flags &
2591 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2592 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2593
2594 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
2595 const VkRenderingInfo *resume_info =
2596 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
2597 pBeginInfo,
2598 gcbiar_data);
2599 if (resume_info != NULL) {
2600 genX(CmdBeginRendering)(commandBuffer, resume_info);
2601 } else {
2602 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
2603 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
2604 pBeginInfo);
2605 assert(inheritance_info);
2606
2607 gfx->rendering_flags = inheritance_info->flags;
2608 gfx->render_area = (VkRect2D) { };
2609 gfx->layer_count = 0;
2610 gfx->samples = inheritance_info->rasterizationSamples;
2611 gfx->view_mask = inheritance_info->viewMask;
2612
2613 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
2614 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
2615 if (result != VK_SUCCESS)
2616 return result;
2617
2618 for (uint32_t i = 0; i < color_att_count; i++) {
2619 gfx->color_att[i].vk_format =
2620 inheritance_info->pColorAttachmentFormats[i];
2621 }
2622 gfx->depth_att.vk_format =
2623 inheritance_info->depthAttachmentFormat;
2624 gfx->stencil_att.vk_format =
2625 inheritance_info->stencilAttachmentFormat;
2626
2627 anv_cmd_graphic_state_update_has_uint_rt(gfx);
2628
2629 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
2630 ANV_CMD_DIRTY_RENDER_TARGETS;
2631 }
2632 }
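   /* For reference, the application-side structure that drives the inherited
    * rendering path above looks like the following (illustrative values):
    *
    *    VkCommandBufferInheritanceRenderingInfo rendering_info = {
    *       .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDERING_INFO,
    *       .colorAttachmentCount = 1,
    *       .pColorAttachmentFormats = (VkFormat[]) { VK_FORMAT_R8G8B8A8_UNORM },
    *       .depthAttachmentFormat = VK_FORMAT_D32_SFLOAT,
    *       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
    *    };
    *
    * chained into VkCommandBufferInheritanceInfo::pNext of a secondary begun
    * with VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT.
    */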
2633
2634 /* Emit the sample pattern at the beginning of the batch because the
2635 * default locations emitted at the device initialization might have been
2636 * changed by a previous command buffer.
2637 *
2638 * Do not change that when we're continuing a previous renderpass.
2639 */
2640 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
2641 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
2642 genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
2643
2644 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2645 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
2646 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
2647
2648 /* If secondary buffer supports conditional rendering
2649 * we should emit commands as if conditional rendering is enabled.
2650 */
2651 cmd_buffer->state.conditional_render_enabled =
2652 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
2653
2654 if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
2655 cmd_buffer->state.gfx.n_occlusion_queries = 1;
2656 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
2657 }
2658 }
2659
2660 return VK_SUCCESS;
2661 }
2662
2663 /* From the PRM, Volume 2a:
2664 *
2665 * "Indirect State Pointers Disable
2666 *
2667 * At the completion of the post-sync operation associated with this pipe
2668 * control packet, the indirect state pointers in the hardware are
2669 * considered invalid; the indirect pointers are not saved in the context.
2670 * If any new indirect state commands are executed in the command stream
2671 * while the pipe control is pending, the new indirect state commands are
2672 * preserved.
2673 *
2674 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
2675 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
2676 * commands are only considered as Indirect State Pointers. Once ISP is
2677 * issued in a context, SW must initialize by programming push constant
2678 * commands for all the shaders (at least to zero length) before attempting
2679 * any rendering operation for the same context."
2680 *
2681 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
2682 * even though they point to a BO that has been already unreferenced at
2683 * the end of the previous batch buffer. This has been fine so far since
2684 * we are protected by the scratch page (every address not covered by
2685 * a BO should be pointing to the scratch page). But on CNL, it is
2686 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
2687 * instruction.
2688 *
2689 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
2690 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
2691 * context restore, so the mentioned hang doesn't happen. However,
2692 * software must program push constant commands for all stages prior to
2693 * rendering anything. So we flag them dirty in BeginCommandBuffer.
2694 *
2695 * Finally, we also stall at pixel scoreboard to make sure the constants
2696 * have been loaded into the EUs prior to disabling the push constants,
2697 * so that doing so doesn't hang a previous 3DPRIMITIVE.
2698 */
2699 static void
2700 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
2701 {
2702 genx_batch_emit_pipe_control(&cmd_buffer->batch,
2703 cmd_buffer->device->info,
2704 cmd_buffer->state.current_pipeline,
2705 ANV_PIPE_CS_STALL_BIT |
2706 ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
2707 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2708 pc.IndirectStatePointersDisable = true;
2709 pc.CommandStreamerStallEnable = true;
2710 anv_debug_dump_pc(pc, __func__);
2711 }
2712 }
2713
2714 static VkResult
2715 end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
2716 {
2717 if (anv_batch_has_error(&cmd_buffer->batch))
2718 return cmd_buffer->batch.status;
2719
2720 anv_measure_endcommandbuffer(cmd_buffer);
2721
2722 if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
2723 anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
2724 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
2725 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2726 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
2727 return VK_SUCCESS;
2728 }
2729
2730 /* Flush query clears using blorp so that secondary query writes do not
2731 * race with the clear.
2732 */
2733 if (cmd_buffer->state.queries.clear_bits) {
2734 anv_add_pending_pipe_bits(cmd_buffer,
2735 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
2736 "query clear flush prior command buffer end");
2737 }
2738
2739 genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
2740
2741 /* Turn on object level preemption if it is disabled to have it in known
2742 * state at the beginning of new command buffer.
2743 */
2744 if (!cmd_buffer->state.gfx.object_preemption)
2745 genX(cmd_buffer_set_preemption)(cmd_buffer, true);
2746
2747 /* We want every command buffer to start with the PMA fix in a known state,
2748 * so we disable it at the end of the command buffer.
2749 */
2750 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
2751
2752 /* Wa_14015814527
2753 *
2754 * Apply task URB workaround in the end of primary or secondary cmd_buffer.
2755 */
2756 genX(apply_task_urb_workaround)(cmd_buffer);
2757
2758 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2759
2760 emit_isp_disable(cmd_buffer);
2761
2762 #if GFX_VER >= 12
2763 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
2764 cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) {
2765 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2766 pc.CommandStreamerStallEnable = true;
2767 pc.DCFlushEnable = true;
2768 pc.RenderTargetCacheFlushEnable = true;
2769 pc.ProtectedMemoryDisable = true;
2770 }
2771 }
2772 #endif
2773
2774 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
2775
2776 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
2777
2778 return VK_SUCCESS;
2779 }
2780
2781 VkResult
2782 genX(EndCommandBuffer)(
2783 VkCommandBuffer commandBuffer)
2784 {
2785 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2786
2787 VkResult status = end_command_buffer(cmd_buffer);
2788 if (status != VK_SUCCESS)
2789 return status;
2790
2791 /* If there was MSAA access over the compute/transfer queue, a companion
2792 * RCS command buffer was used; end it properly as well.
2793 */
2794 if (cmd_buffer->companion_rcs_cmd_buffer) {
2795 assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
2796 anv_cmd_buffer_is_blitter_queue(cmd_buffer));
2797 status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
2798 }
2799
2800 ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
2801
2802 return status;
2803 }
2804
2805 static void
2806 cmd_buffer_emit_copy_ts_buffer(struct u_trace_context *utctx,
2807 void *cmdstream,
2808 void *ts_from, uint32_t from_offset,
2809 void *ts_to, uint32_t to_offset,
2810 uint32_t count)
2811 {
2812 struct anv_memcpy_state *memcpy_state = cmdstream;
2813 struct anv_address from_addr = (struct anv_address) {
2814 .bo = ts_from, .offset = from_offset * sizeof(uint64_t) };
2815 struct anv_address to_addr = (struct anv_address) {
2816 .bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
2817
2818 genX(emit_so_memcpy)(memcpy_state, to_addr, from_addr,
2819 count * sizeof(uint64_t));
2820 }
2821
2822 void
2823 genX(CmdExecuteCommands)(
2824 VkCommandBuffer commandBuffer,
2825 uint32_t commandBufferCount,
2826 const VkCommandBuffer* pCmdBuffers)
2827 {
2828 ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
2829
2830 struct anv_device *device = container->device;
2831
2832 if (anv_batch_has_error(&container->batch))
2833 return;
2834
2835 /* The secondary command buffers will assume that the PMA fix is disabled
2836 * when they begin executing. Make sure this is true.
2837 */
2838 genX(cmd_buffer_enable_pma_fix)(container, false);
2839
2840 /* Turn on preemption in case it was toggled off. */
2841 if (!container->state.gfx.object_preemption)
2842 genX(cmd_buffer_set_preemption)(container, true);
2843
2844 /* Wa_14015814527
2845 *
2846 * Apply task URB workaround before secondary cmd buffers.
2847 */
2848 genX(apply_task_urb_workaround)(container);
2849
2850 /* Flush query clears using blorp so that secondary query writes do not
2851 * race with the clear.
2852 */
2853 if (container->state.queries.clear_bits) {
2854 anv_add_pending_pipe_bits(container,
2855 ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
2856 "query clear flush prior to secondary buffer");
2857 }
2858
2859 /* The secondary command buffer doesn't know which textures etc. have been
2860 * flushed prior to their execution. Apply those flushes now.
2861 */
2862 genX(cmd_buffer_apply_pipe_flushes)(container);
2863
2864 genX(cmd_buffer_flush_generated_draws)(container);
2865
2866 for (uint32_t i = 0; i < commandBufferCount; i++) {
2867 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
2868
2869 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
2870 assert(!anv_batch_has_error(&secondary->batch));
2871
2872 if (secondary->state.conditional_render_enabled) {
2873 if (!container->state.conditional_render_enabled) {
2874 /* The secondary buffer is constructed as if it will be executed
2875 * with conditional rendering, so we must satisfy this dependency
2876 * regardless of whether conditional rendering is enabled in the container.
2877 */
2878 struct mi_builder b;
2879 mi_builder_init(&b, device->info, &container->batch);
2880 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
2881 mi_imm(UINT64_MAX));
2882 }
2883 }
2884
2885 if (secondary->usage_flags &
2886 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
2887 /* If we're continuing a render pass from the container, we need to
2888 * copy the surface states for the current subpass into the storage
2889 * we allocated for them in BeginCommandBuffer.
2890 */
2891 struct anv_state src_state = container->state.gfx.att_states;
2892 struct anv_state dst_state = secondary->state.gfx.att_states;
2893 assert(src_state.alloc_size == dst_state.alloc_size);
2894
2895 genX(cmd_buffer_so_memcpy)(
2896 container,
2897 anv_state_pool_state_address(&device->internal_surface_state_pool,
2898 dst_state),
2899 anv_state_pool_state_address(&device->internal_surface_state_pool,
2900 src_state),
2901 src_state.alloc_size);
2902 }
2903
2904 anv_cmd_buffer_add_secondary(container, secondary);
2905
2906 /* Add secondary buffer's RCS command buffer to container buffer's RCS
2907 * command buffer for execution if secondary RCS is valid.
2908 */
2909 if (secondary->companion_rcs_cmd_buffer != NULL) {
2910 VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
2911 if (result != VK_SUCCESS) {
2912 anv_batch_set_error(&container->batch, result);
2913 return;
2914 }
2915
2916 anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
2917 secondary->companion_rcs_cmd_buffer);
2918 }
2919
2920 assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
2921 secondary->perf_query_pool == container->perf_query_pool);
2922 if (secondary->perf_query_pool)
2923 container->perf_query_pool = secondary->perf_query_pool;
2924
2925 #if INTEL_NEEDS_WA_1808121037
2926 if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
2927 container->state.depth_reg_mode = secondary->state.depth_reg_mode;
2928 #endif
2929
2930 container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
2931 }
2932
2933 /* The secondary isn't counted in our VF cache tracking so we need to
2934 * invalidate the whole thing.
2935 */
2936 if (GFX_VER == 9) {
2937 anv_add_pending_pipe_bits(container,
2938 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
2939 "Secondary cmd buffer not tracked in VF cache");
2940 }
2941
2942 #if INTEL_WA_16014538804_GFX_VER
2943 if (anv_cmd_buffer_is_render_queue(container) &&
2944 intel_needs_workaround(device->info, 16014538804))
2945 anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
2946 #endif
2947
2948 /* The secondary may have selected a different pipeline (3D or compute) and
2949 * may have changed the current L3$ configuration. Reset our tracking
2950 * variables to invalid values to ensure that we re-emit these in the case
2951 * where we do any draws or compute dispatches from the container after the
2952 * secondary has returned.
2953 */
2954 container->state.current_pipeline = UINT32_MAX;
2955 container->state.current_l3_config = NULL;
2956 container->state.current_hash_scale = 0;
2957 container->state.gfx.push_constant_stages = 0;
2958 container->state.gfx.ds_write_state = false;
2959 memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
2960 memcpy(container->state.gfx.dyn_state.dirty,
2961 device->gfx_dirty_state,
2962 sizeof(container->state.gfx.dyn_state.dirty));
2963
2964 /* Each of the secondary command buffers will use its own state base
2965 * address. We need to re-emit state base address for the container after
2966 * all of the secondaries are done.
2967 *
2968 * TODO: Maybe we want to make this a dirty bit to avoid extra state base
2969 * address calls?
2970 */
2971 genX(cmd_buffer_emit_state_base_address)(container);
2972
2973 /* Copy the utrace timestamp buffers from the secondaries into the container. */
2974 if (u_trace_enabled(&device->ds.trace_context)) {
2975 trace_intel_begin_trace_copy(&container->trace);
2976
2977 struct anv_memcpy_state memcpy_state;
2978 genX(emit_so_memcpy_init)(&memcpy_state, device, &container->batch);
2979 uint32_t num_traces = 0;
2980 for (uint32_t i = 0; i < commandBufferCount; i++) {
2981 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
2982
2983 num_traces += secondary->trace.num_traces;
2984 u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
2985 u_trace_end_iterator(&secondary->trace),
2986 &container->trace,
2987 &memcpy_state,
2988 cmd_buffer_emit_copy_ts_buffer);
2989 }
2990 genX(emit_so_memcpy_fini)(&memcpy_state);
2991
2992 trace_intel_end_trace_copy(&container->trace, num_traces);
2993
2994 /* Memcpy is done using the 3D pipeline. */
2995 container->state.current_pipeline = _3D;
2996 }
2997 }
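
/* Illustrative application-side sequence that exercises the
 * RENDER_PASS_CONTINUE_BIT path above (a sketch, not driver code; the
 * secondary is assumed to have been recorded with
 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT):
 *
 *    vkCmdBeginRendering(primary, &rendering_info with
 *                        VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT);
 *    vkCmdExecuteCommands(primary, 1, &secondary);
 *    vkCmdEndRendering(primary);
 *
 * In that case the attachment surface states recorded in the container are
 * copied into the storage the secondary allocated in BeginCommandBuffer.
 */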
2998
2999 static inline enum anv_pipe_bits
3000 anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3001 VkAccessFlags2 flags)
3002 {
3003 enum anv_pipe_bits pipe_bits = 0;
3004
3005 u_foreach_bit64(b, flags) {
3006 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3007 case VK_ACCESS_2_SHADER_WRITE_BIT:
3008 case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
3009 case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
3010 /* We're transitioning a buffer that was previously used as write
3011 * destination through the data port. To make its content available
3012 * to future operations, flush the hdc pipeline.
3013 */
3014 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3015 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3016 break;
3017 case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
3018 /* We're transitioning a buffer that was previously used as render
3019 * target. To make its content available to future operations, flush
3020 * the render target cache.
3021 */
3022 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3023 break;
3024 case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
3025 /* We're transitioning a buffer that was previously used as depth
3026 * buffer. To make its content available to future operations, flush
3027 * the depth cache.
3028 */
3029 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3030 break;
3031 case VK_ACCESS_2_TRANSFER_WRITE_BIT:
3032 /* We're transitioning a buffer that was previously used as a
3033 * transfer write destination. Generic write operations include color
3034 * & depth operations as well as buffer operations such as:
3035 * - vkCmdClearColorImage()
3036 * - vkCmdClearDepthStencilImage()
3037 * - vkCmdBlitImage()
3038 * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
3039 *
3040 * Most of these operations are implemented using Blorp which writes
3041 * through the render target cache or the depth cache on the graphics
3042 * queue. On the compute queue, the writes are done through the data
3043 * port.
3044 */
3045 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
3046 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3047 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3048 } else {
3049 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
3050 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3051 }
3052 break;
3053 case VK_ACCESS_2_MEMORY_WRITE_BIT:
3054 /* We're transitioning a buffer for generic write operations. Flush
3055 * all the caches.
3056 */
3057 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3058 break;
3059 case VK_ACCESS_2_HOST_WRITE_BIT:
3060 /* We're transitioning a buffer for access by CPU. Invalidate
3061 * all the caches. Since the data and tile caches have no invalidation
3062 * operation, we are forced to flush them as well.
3063 */
3064 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3065 pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3066 break;
3067 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3068 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
3069 /* We're transitioning a buffer written either from the VS stage or from
3070 * the command streamer (see CmdEndTransformFeedbackEXT), so we just
3071 * need to stall the CS.
3072 *
3073 * Streamout writes apparently bypass L3, so in order to make them
3074 * visible to the destination, we need to invalidate the other
3075 * caches.
3076 */
3077 pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
3078 break;
3079 default:
3080 break; /* Nothing to do */
3081 }
3082 }
3083
3084 return pipe_bits;
3085 }
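
/* Worked example (illustrative): a barrier whose srcAccessMask is
 * VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_2_TRANSFER_WRITE_BIT,
 * recorded on a render queue, comes out of the switch above as
 *
 *    ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
 *    ANV_PIPE_DEPTH_CACHE_FLUSH_BIT
 *
 * since the transfer-write case adds both render target & depth flushes on
 * graphics queues, while the color-attachment case adds only the RT flush.
 */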
3086
3087 static inline enum anv_pipe_bits
3088 anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
3089 VkAccessFlags2 flags)
3090 {
3091 struct anv_device *device = cmd_buffer->device;
3092 enum anv_pipe_bits pipe_bits = 0;
3093
3094 u_foreach_bit64(b, flags) {
3095 switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
3096 case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
3097 /* Indirect draw commands take a buffer as input that we're going to
3098 * read from the command streamer to load some of the HW registers
3099 * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
3100 * command streamer stall so that all the cache flushes have
3101 * completed before the command streamer loads from memory.
3102 */
3103 pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3104 /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex
3105 * through a vertex buffer, so invalidate that cache.
3106 */
3107 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3108 /* For CmdDispatchIndirect, we also load gl_NumWorkGroups through a
3109 * UBO from the buffer, so we need to invalidate the constant cache.
3110 */
3111 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3112 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3113 /* A tile cache flush is needed for CmdDispatchIndirect since the
3114 * command streamer and vertex fetch aren't L3 coherent.
3115 */
3116 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3117 break;
3118 case VK_ACCESS_2_INDEX_READ_BIT:
3119 case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
3120 /* We're transitioning a buffer to be used as input for vkCmdDraw*
3121 * commands, so we invalidate the VF cache to make sure there is no
3122 * stale data when we start rendering.
3123 */
3124 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
3125 break;
3126 case VK_ACCESS_2_UNIFORM_READ_BIT:
3127 case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
3128 /* We're transitioning a buffer to be used as uniform data. Because
3129 * uniform is accessed through the data port & sampler, we need to
3130 * invalidate the texture cache (sampler) & constant cache (data
3131 * port) to avoid stale data.
3132 */
3133 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
3134 if (device->physical->compiler->indirect_ubos_use_sampler) {
3135 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3136 } else {
3137 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3138 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3139 }
3140 break;
3141 case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
3142 case VK_ACCESS_2_TRANSFER_READ_BIT:
3143 case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
3144 /* Transitioning a buffer to be read through the sampler, so
3145 * invalidate the texture cache; we don't want any stale data.
3146 */
3147 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3148 break;
3149 case VK_ACCESS_2_SHADER_READ_BIT:
3150 /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
3151 * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
3152 */
3153 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3154 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
3155 if (!device->physical->compiler->indirect_ubos_use_sampler) {
3156 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3157 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3158 }
3159 break;
3160 case VK_ACCESS_2_MEMORY_READ_BIT:
3161 /* Transitioning a buffer for generic read, invalidate all the
3162 * caches.
3163 */
3164 pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
3165 break;
3166 case VK_ACCESS_2_MEMORY_WRITE_BIT:
3167 /* Generic write, make sure all previously written things land in
3168 * memory.
3169 */
3170 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3171 break;
3172 case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
3173 case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
3174 /* Transitioning a buffer for conditional rendering or transform
3175 * feedback. We'll load the content of this buffer into HW registers
3176 * using the command streamer, so we need to stall the command
3177 * streamer to make sure any in-flight flush operations have
3178 * completed.
3179 */
3180 pipe_bits |= ANV_PIPE_CS_STALL_BIT;
3181 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3182 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
3183 break;
3184 case VK_ACCESS_2_HOST_READ_BIT:
3185 /* We're transitioning a buffer that was written by CPU. Flush
3186 * all the caches.
3187 */
3188 pipe_bits |= ANV_PIPE_FLUSH_BITS;
3189 break;
3190 case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
3191 /* We're transitioning a buffer to be written by the streamout fixed
3192 * function. This one is apparently not L3 coherent, so we need a
3193 * tile cache flush to make sure any previous write is not going to
3194 * create WaW hazards.
3195 */
3196 pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
3197 break;
3198 case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
3199 /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
3200 * storage buffer, physical storage buffer, storage texel buffer, or
3201 * storage image in any shader pipeline stage.
3202 *
3203 * Any storage buffers or images written to must be invalidated and
3204 * flushed before the shader can access them.
3205 *
3206 * Both the HDC & untyped dataport flushes also perform invalidation,
3207 * which is why we use them here on Gfx12+.
3208 *
3209 * Gfx11 and prior don't have HDC. Only Data cache flush is available
3210 * and it only operates on the written cache lines.
3211 */
3212 if (device->info->ver >= 12) {
3213 pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3214 pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3215 }
3216 break;
3217 default:
3218 break; /* Nothing to do */
3219 }
3220 }
3221
3222 return pipe_bits;
3223 }
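
/* Worked example (illustrative): dstAccessMask =
 * VK_ACCESS_2_INDEX_READ_BIT | VK_ACCESS_2_UNIFORM_READ_BIT, on a device
 * where indirect UBOs go through the sampler, accumulates
 *
 *    ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
 *    ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
 *    ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT
 *
 * from the two cases above.
 */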
3224
3225 static inline bool
3226 stage_is_shader(const VkPipelineStageFlags2 stage)
3227 {
3228 return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3229 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3230 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3231 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3232 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
3233 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
3234 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3235 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3236 VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
3237 VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
3238 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
3239 }
3240
3241 static inline bool
3242 stage_is_transfer(const VkPipelineStageFlags2 stage)
3243 {
3244 return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3245 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
3246 }
3247
3248 static inline bool
3249 stage_is_video(const VkPipelineStageFlags2 stage)
3250 {
3251 return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
3252 #ifdef VK_ENABLE_BETA_EXTENSIONS
3253 VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
3254 #endif
3255 VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
3256 }
3257
3258 static inline bool
3259 mask_is_shader_write(const VkAccessFlags2 access)
3260 {
3261 return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
3262 VK_ACCESS_2_MEMORY_WRITE_BIT |
3263 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
3264 }
3265
3266 static inline bool
3267 mask_is_write(const VkAccessFlags2 access)
3268 {
3269 return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
3270 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
3271 VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
3272 VK_ACCESS_2_TRANSFER_WRITE_BIT |
3273 VK_ACCESS_2_HOST_WRITE_BIT |
3274 VK_ACCESS_2_MEMORY_WRITE_BIT |
3275 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
3276 VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
3277 #ifdef VK_ENABLE_BETA_EXTENSIONS
3278 VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
3279 #endif
3280 VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
3281 VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
3282 VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
3283 VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
3284 VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
3285 VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
3286 }
3287
3288 static inline bool
3289 mask_is_transfer_write(const VkAccessFlags2 access)
3290 {
3291 return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
3292 VK_ACCESS_2_MEMORY_WRITE_BIT);
3293 }
3294
3295 static void
3296 cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
3297 const VkDependencyInfo *dep_info)
3298 {
3299 assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
3300
3301 bool flush_llc = false;
3302 bool flush_ccs = false;
3303 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
3304 const VkImageMemoryBarrier2 *img_barrier =
3305 &dep_info->pImageMemoryBarriers[i];
3306
3307 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
3308 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
3309
3310 /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
3311 * memory barrier defines a queue family ownership transfer.
3312 */
3313 if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
3314 flush_llc = true;
3315
3316 VkImageAspectFlags img_aspects =
3317 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3318 anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
3319 const uint32_t plane =
3320 anv_image_aspect_to_plane(image, 1UL << aspect_bit);
3321 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
3322 flush_ccs = true;
3323 }
3324 }
3325 }
3326
3327 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
3328 /* Flush the cache if something is written by the video operations and
3329 * used by any other stages except video encode/decode stages or if
3330 * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this memory
3331 * barrier defines a queue family ownership transfer.
3332 */
3333 if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
3334 mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
3335 !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
3336 (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
3337 dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
3338 flush_llc = true;
3339 break;
3340 }
3341 }
3342
3343 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
3344 /* Flush the cache if something is written by the video operations and
3345 * used by any other stages except video encode/decode stage.
3346 */
3347 if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
3348 mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
3349 !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
3350 flush_llc = true;
3351 break;
3352 }
3353 }
3354
3355 if (flush_ccs || flush_llc) {
3356 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
3357 #if GFX_VERx10 >= 125
3358 fd.FlushCCS = flush_ccs;
3359 #endif
3360 #if GFX_VER >= 12
3361 /* Using this bit on Gfx9 triggers a GPU hang.
3362 * This is undocumented behavior. Gfx12 seems fine.
3363 * TODO: check Gfx11
3364 */
3365 fd.FlushLLC = flush_llc;
3366 #endif
3367 }
3368 }
3369 }
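
/* Illustrative example of the decision above (a sketch, not additional
 * driver logic): a VkMemoryBarrier2 with srcStageMask =
 * VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR, srcAccessMask =
 * VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR and dstStageMask =
 * VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT makes decoded data visible to a
 * non-video stage, so flush_llc is set and a single MI_FLUSH_DW is emitted
 * (with FlushLLC on Gfx12+).
 */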
3370
3371 static void
3372 cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
3373 const VkDependencyInfo *dep_info)
3374 {
3375 #if GFX_VERx10 >= 125
3376 assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
3377
3378 /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
3379 * from being a destination to a source.
3380 */
3381 bool flush_llc = false;
3382 bool flush_ccs = false;
3383 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
3384 const VkImageMemoryBarrier2 *img_barrier =
3385 &dep_info->pImageMemoryBarriers[i];
3386
3387 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
3388 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
3389
3390 /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
3391 * memory barrier defines a queue family transfer operation.
3392 */
3393 if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
3394 flush_llc = true;
3395
3396 /* Flush the cache if a transfer command reads the output of a previous
3397 * transfer command. Ideally we would just wait for completion, but for
3398 * now flush the cache to make the data visible.
3399 */
3400 if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
3401 img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
3402 (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
3403 img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
3404 flush_llc = true;
3405 }
3406
3407 VkImageAspectFlags img_aspects =
3408 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3409 anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
3410 const uint32_t plane =
3411 anv_image_aspect_to_plane(image, 1UL << aspect_bit);
3412 if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
3413 flush_ccs = true;
3414 }
3415 }
3416 }
3417
3418 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
3419 /* Flush the cache if something is written by the transfer command and
3420 * used by any other stages except transfer stage or if
3421 * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this memory
3422 * barrier defines a queue family transfer operation.
3423 */
3424 if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
3425 mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
3426 (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
3427 dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
3428 flush_llc = true;
3429 break;
3430 }
3431 }
3432
3433 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
3434 /* Flush the cache if something is written by the transfer command and
3435 * used by any other stages except transfer stage.
3436 */
3437 if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
3438 mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
3439 flush_llc = true;
3440 break;
3441 }
3442 }
3443
3444 if (flush_ccs || flush_llc) {
3445 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
3446 if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
3447 genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
3448 cmd_buffer->device);
3449 }
3450 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
3451 fd.FlushCCS = flush_ccs;
3452 fd.FlushLLC = flush_llc;
3453 }
3454 }
3455 #endif
3456 }
3457
3458 static inline bool
3459 cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
3460 {
3461 /* Query copies are only written with dataport, so we only need to check
3462 * that flag.
3463 */
3464 return (cmd_buffer->state.queries.buffer_write_bits &
3465 ANV_QUERY_WRITES_DATA_FLUSH) != 0;
3466 }
3467
3468 static void
3469 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
3470 const VkDependencyInfo *dep_info,
3471 const char *reason)
3472 {
3473 if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
3474 cmd_buffer_barrier_video(cmd_buffer, dep_info);
3475 return;
3476 }
3477
3478 if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
3479 cmd_buffer_barrier_blitter(cmd_buffer, dep_info);
3480 return;
3481 }
3482
3483 struct anv_device *device = cmd_buffer->device;
3484
3485 /* XXX: Right now, we're really dumb and just flush whatever categories
3486 * the app asks for. One of these days we may make this a bit better
3487 * but right now that's all the hardware allows for in most areas.
3488 */
3489 VkAccessFlags2 src_flags = 0;
3490 VkAccessFlags2 dst_flags = 0;
3491
3492 bool apply_sparse_flushes = false;
3493 bool flush_query_copies = false;
3494
3495 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
3496 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
3497 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
3498
3499 /* Shader writes to buffers that could then be written by a transfer
3500 * command (including queries).
3501 */
3502 if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
3503 mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
3504 stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
3505 cmd_buffer->state.queries.buffer_write_bits |=
3506 ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
3507 }
3508
3509 if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
3510 mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
3511 cmd_buffer_has_pending_copy_query(cmd_buffer))
3512 flush_query_copies = true;
3513
3514 /* There's no way of knowing if this memory barrier is related to sparse
3515 * buffers! This is pretty horrible.
3516 */
3517 if (device->using_sparse && mask_is_write(src_flags))
3518 apply_sparse_flushes = true;
3519 }
3520
3521 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
3522 const VkBufferMemoryBarrier2 *buf_barrier =
3523 &dep_info->pBufferMemoryBarriers[i];
3524 ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
3525
3526 src_flags |= buf_barrier->srcAccessMask;
3527 dst_flags |= buf_barrier->dstAccessMask;
3528
3529 /* Shader writes to buffers that could then be written by a transfer
3530 * command (including queries).
3531 */
3532 if (stage_is_shader(buf_barrier->srcStageMask) &&
3533 mask_is_shader_write(buf_barrier->srcAccessMask) &&
3534 stage_is_transfer(buf_barrier->dstStageMask)) {
3535 cmd_buffer->state.queries.buffer_write_bits |=
3536 ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
3537 }
3538
3539 if (stage_is_transfer(buf_barrier->srcStageMask) &&
3540 mask_is_transfer_write(buf_barrier->srcAccessMask) &&
3541 cmd_buffer_has_pending_copy_query(cmd_buffer))
3542 flush_query_copies = true;
3543
3544 if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
3545 apply_sparse_flushes = true;
3546 }
3547
3548 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
3549 const VkImageMemoryBarrier2 *img_barrier =
3550 &dep_info->pImageMemoryBarriers[i];
3551
3552 src_flags |= img_barrier->srcAccessMask;
3553 dst_flags |= img_barrier->dstAccessMask;
3554
3555 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
3556 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
3557
3558 uint32_t base_layer, layer_count;
3559 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
3560 base_layer = 0;
3561 layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
3562 } else {
3563 base_layer = range->baseArrayLayer;
3564 layer_count = vk_image_subresource_layer_count(&image->vk, range);
3565 }
3566 const uint32_t level_count =
3567 vk_image_subresource_level_count(&image->vk, range);
3568
3569 VkImageLayout old_layout = img_barrier->oldLayout;
3570 VkImageLayout new_layout = img_barrier->newLayout;
3571
3572 /* If we're inside a render pass, the runtime might have converted some
3573 * layouts from GENERAL to FEEDBACK_LOOP. Check if that's the case and
3574 * reconvert back to the original layout so that application barriers
3575 * within renderpass are operating with consistent layouts.
3576 */
3577 if (!cmd_buffer->vk.runtime_rp_barrier &&
3578 cmd_buffer->vk.render_pass != NULL) {
3579 assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
3580 image));
3581 VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
3582
3583 vk_command_buffer_get_attachment_layout(
3584 &cmd_buffer->vk, &image->vk,
3585 &subpass_att_layout, &subpass_stencil_att_layout);
3586
3587 old_layout = subpass_att_layout;
3588 new_layout = subpass_att_layout;
3589 }
3590
3591 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3592 transition_depth_buffer(cmd_buffer, image,
3593 base_layer, layer_count,
3594 old_layout, new_layout,
3595 false /* will_full_fast_clear */);
3596 }
3597
3598 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3599 transition_stencil_buffer(cmd_buffer, image,
3600 range->baseMipLevel, level_count,
3601 base_layer, layer_count,
3602 old_layout, new_layout,
3603 false /* will_full_fast_clear */);
3604 }
3605
3606 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
3607 VkImageAspectFlags color_aspects =
3608 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3609 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
3610 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
3611 range->baseMipLevel, level_count,
3612 base_layer, layer_count,
3613 old_layout, new_layout,
3614 img_barrier->srcQueueFamilyIndex,
3615 img_barrier->dstQueueFamilyIndex,
3616 false /* will_full_fast_clear */);
3617 }
3618 }
3619
3620 /* Mark image as compressed if the destination layout has untracked
3621 * writes to the aux surface.
3622 */
3623 VkImageAspectFlags aspects =
3624 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
3625 anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
3626 VkImageAspectFlagBits aspect = 1UL << aspect_bit;
3627 if (anv_layout_has_untracked_aux_writes(
3628 device->info,
3629 image, aspect,
3630 img_barrier->newLayout,
3631 cmd_buffer->queue_family->queueFlags)) {
3632 for (uint32_t l = 0; l < level_count; l++) {
3633 set_image_compressed_bit(cmd_buffer, image, aspect,
3634 range->baseMipLevel + l,
3635 base_layer, layer_count,
3636 true);
3637 }
3638 }
3639 }
3640
3641 if (anv_image_is_sparse(image) && mask_is_write(src_flags))
3642 apply_sparse_flushes = true;
3643 }
3644
3645 enum anv_pipe_bits bits =
3646 anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
3647 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
3648
3649 /* Our HW implementation of the sparse feature lives in the GAM unit
3650 * (the interface between all the GPU caches and external memory). As a
3651 * result, writes to NULL-bound images & buffers that should be ignored are
3652 * actually still visible in the caches. The only way for us to get
3653 * NULL-bound regions to correctly return 0s is to evict the caches so that
3654 * they are repopulated with 0s.
3655 */
3656 if (apply_sparse_flushes)
3657 bits |= ANV_PIPE_FLUSH_BITS;
3658
3659 /* Copies from query pools are executed with a shader writing through the
3660 * dataport.
3661 */
3662 if (flush_query_copies) {
3663 bits |= (GFX_VER >= 12 ?
3664 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
3665 }
3666
3667 if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
3668 genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
3669
3670 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
3671 }
3672
3673 void genX(CmdPipelineBarrier2)(
3674 VkCommandBuffer commandBuffer,
3675 const VkDependencyInfo* pDependencyInfo)
3676 {
3677 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3678
3679 cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
3680 }
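
/* Illustrative application-side usage (a sketch; handles and values are
 * arbitrary placeholders):
 *
 *    VkImageMemoryBarrier2 img = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
 *       .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
 *       .image = image,
 *       .subresourceRange = range,
 *    };
 *    VkDependencyInfo dep = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .imageMemoryBarrierCount = 1,
 *       .pImageMemoryBarriers = &img,
 *    };
 *    vkCmdPipelineBarrier2(cmd, &dep);
 *
 * On a render queue this reaches cmd_buffer_barrier() above, performs any
 * aux transition required by the layout change, and ends up adding a render
 * target cache flush plus a texture cache invalidate to the pending bits.
 */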
3681
3682 void
3683 genX(batch_emit_breakpoint)(struct anv_batch *batch,
3684 struct anv_device *device,
3685 bool emit_before_draw)
3686 {
3687 /* Update draw call count once */
3688 uint32_t draw_count = emit_before_draw ?
3689 p_atomic_inc_return(&device->draw_call_count) :
3690 p_atomic_read(&device->draw_call_count);
3691
3692 if (((draw_count == intel_debug_bkp_before_draw_count &&
3693 emit_before_draw) ||
3694 (draw_count == intel_debug_bkp_after_draw_count &&
3695 !emit_before_draw))) {
3696 struct anv_address wait_addr =
3697 anv_state_pool_state_address(&device->dynamic_state_pool,
3698 device->breakpoint);
3699
3700 anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
3701 sem.WaitMode = PollingMode;
3702 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
3703 sem.SemaphoreDataDword = 0x1;
3704 sem.SemaphoreAddress = wait_addr;
3705 };
3706 }
3707 }
3708
3709 /* Only emits PIPELINE_SELECT; for the whole mode switch including the
3710 * required flushing, use flush_pipeline_select().
3711 */
3712 void
3713 genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
3714 const struct anv_device *device)
3715 {
3716 /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
3717 #if GFX_VER < 20
3718 anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
3719 ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
3720 #if GFX_VER == 12
3721 ps.MediaSamplerDOPClockGateEnable = true;
3722 #endif
3723 ps.PipelineSelection = pipeline;
3724 #if GFX_VERx10 == 125
3725 /* It might still be better to only enable this when the compute
3726 * pipeline will have DPAS instructions.
3727 */
3728 ps.SystolicModeEnable = pipeline == GPGPU &&
3729 device->vk.enabled_extensions.KHR_cooperative_matrix &&
3730 device->vk.enabled_features.cooperativeMatrix;
3731 #endif
3732 }
3733 #endif /* if GFX_VER < 20 */
3734 }
3735
3736 static void
3737 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
3738 uint32_t pipeline)
3739 {
3740 UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
3741
3742 if (cmd_buffer->state.current_pipeline == pipeline)
3743 return;
3744
3745 #if GFX_VER >= 20
3746 /* Since we are not stalling/flushing caches explicitly while switching
3747 * between the pipelines, we need to apply data dependency flushes recorded
3748 * previously on the resource.
3749 */
3750 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3751 #else
3752
3753 #if GFX_VER == 9
3754 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
3755 *
3756 * Software must clear the COLOR_CALC_STATE Valid field in
3757 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
3758 * with Pipeline Select set to GPGPU.
3759 *
3760 * The internal hardware docs recommend the same workaround for Gfx9
3761 * hardware too.
3762 */
3763 if (pipeline == GPGPU)
3764 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
3765 #endif
3766
3767 #if GFX_VERx10 == 120
3768 /* Undocumented workaround to force the re-emission of
3769 * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
3770 * pipeline without rebinding a pipeline :
3771 * vkCmdBindPipeline(COMPUTE, cs_pipeline);
3772 * vkCmdDispatch(...);
3773 * vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
3774 * vkCmdDraw(...);
3775 * vkCmdDispatch(...);
3776 */
3777 if (pipeline == _3D)
3778 cmd_buffer->state.compute.pipeline_dirty = true;
3779 #endif
3780
3781 /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
3782 * pipeline. That means query clears will not be visible to query
3783 * copy/write. So we need to flush it before going to GPGPU mode.
3784 */
3785 if (cmd_buffer->state.current_pipeline == _3D &&
3786 cmd_buffer->state.queries.clear_bits) {
3787 anv_add_pending_pipe_bits(cmd_buffer,
3788 ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
3789 "query clear flush prior to GPGPU");
3790 }
3791
3792 /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
3793 enum anv_pipe_bits bits = 0;
3794
3795 #if GFX_VER >= 12
3796 /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
3797 *
3798 * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
3799 * are flushed through a stalling PIPE_CONTROL command prior to
3800 * programming of PIPELINE_SELECT command transitioning Pipeline Select
3801 * from 3D to GPGPU/Media.
3802 * Software must ensure HDC Pipeline flush and Generic Media State Clear
3803 * is issued through a stalling PIPE_CONTROL command prior to programming
3804 * of PIPELINE_SELECT command transitioning Pipeline Select from
3805 * GPGPU/Media to 3D."
3806 *
3807 * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
3808 * because PIPE was not in MEDIA mode?!
3809 */
3810 bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
3811
3812 if (cmd_buffer->state.current_pipeline == _3D) {
3813 bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3814 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
3815 } else {
3816 bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3817 }
3818 #else
3819 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
3820 * PIPELINE_SELECT [DevBWR+]":
3821 *
3822 * Project: DEVSNB+
3823 *
3824 * Software must ensure all the write caches are flushed through a
3825 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
3826 * command to invalidate read only caches prior to programming
3827 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
3828 *
3829 * Note the cmd_buffer_apply_pipe_flushes will split this into two
3830 * PIPE_CONTROLs.
3831 */
3832 bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
3833 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
3834 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
3835 ANV_PIPE_CS_STALL_BIT |
3836 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
3837 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
3838 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
3839 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
3840 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
3841 #endif
3842
3843 /* Wa_16013063087 - State Cache Invalidate must be issued prior to
3844 * PIPELINE_SELECT when switching from 3D to Compute.
3845 *
3846 * SW must do this by programming a PIPECONTROL with “CS Stall” followed by
3847 * a PIPECONTROL with the State Cache Invalidate bit set.
3848 */
3850 if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
3851 intel_needs_workaround(cmd_buffer->device->info, 16013063087))
3852 bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
3853
3854 anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
3855 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3856
3857 #if GFX_VER == 9
3858 if (pipeline == _3D) {
3859 /* There is a mid-object preemption workaround which requires you to
3860 * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However,
3861 * even without preemption, we have issues with geometry flickering when
3862 * GPGPU and 3D are back-to-back and this seems to fix it. We don't
3863 * really know why.
3864 *
3865 * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
3866 *
3867 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
3868 * the only bits that are changed are scoreboard related ..."
3869 *
3870 * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
3871 */
3872 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
3873 vfe.MaximumNumberofThreads =
3874 devinfo->max_cs_threads * devinfo->subslice_total - 1;
3875 vfe.NumberofURBEntries = 2;
3876 vfe.URBEntryAllocationSize = 2;
3877 }
3878
3879 /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is
3880 * invalid. Set the compute pipeline to dirty to force a re-emit of the
3881 * pipeline in case we get back-to-back dispatch calls with the same
3882 * pipeline and a PIPELINE_SELECT in between.
3883 */
3884 cmd_buffer->state.compute.pipeline_dirty = true;
3885 }
3886 #endif
3887
3888 genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
3889
3890 #if GFX_VER == 9
3891 if (devinfo->platform == INTEL_PLATFORM_GLK) {
3892 /* Project: DevGLK
3893 *
3894 * "This chicken bit works around a hardware issue with barrier logic
3895 * encountered when switching between GPGPU and 3D pipelines. To
3896 * workaround the issue, this mode bit should be set after a pipeline
3897 * is selected."
3898 */
3899 anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) {
3900 scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU
3901 : GLK_BARRIER_MODE_3D_HULL;
3902 scec1.GLKBarrierModeMask = 1;
3903 }
3904 }
3905 #endif
3906 #endif /* else of if GFX_VER >= 20 */
3907 cmd_buffer->state.current_pipeline = pipeline;
3908 }
3909
3910 void
3911 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
3912 {
3913 genX(flush_pipeline_select)(cmd_buffer, _3D);
3914 }
3915
3916 void
3917 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
3918 {
3919 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
3920 }
3921
3922 void
3923 genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
3924 const struct isl_surf *surf)
3925 {
3926 #if INTEL_NEEDS_WA_1808121037
3927 const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
3928 surf->samples == 1;
3929
3930 switch (cmd_buffer->state.depth_reg_mode) {
3931 case ANV_DEPTH_REG_MODE_HW_DEFAULT:
3932 if (!is_d16_1x_msaa)
3933 return;
3934 break;
3935 case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
3936 if (is_d16_1x_msaa)
3937 return;
3938 break;
3939 case ANV_DEPTH_REG_MODE_UNKNOWN:
3940 break;
3941 }
3942
3943 /* We'll change some CHICKEN registers depending on the depth surface
3944 * format. Do a depth flush and stall so the pipeline is not using these
3945 * settings while we change the registers.
3946 */
3947 anv_add_pending_pipe_bits(cmd_buffer,
3948 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
3949 ANV_PIPE_DEPTH_STALL_BIT |
3950 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
3951 "Workaround: Stop pipeline for 1808121037");
3952 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3953
3954 /* Wa_1808121037
3955 *
3956 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
3957 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
3958 */
3959 anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
3960 reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
3961 reg.HIZPlaneOptimizationdisablebitMask = true;
3962 }
3963
3964 cmd_buffer->state.depth_reg_mode =
3965 is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
3966 ANV_DEPTH_REG_MODE_HW_DEFAULT;
3967 #endif
3968 }
3969
3970 #if GFX_VER == 9
3971 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
3972 *
3973 * "The VF cache needs to be invalidated before binding and then using
3974 * Vertex Buffers that overlap with any previously bound Vertex Buffer
3975 * (at a 64B granularity) since the last invalidation. A VF cache
3976 * invalidate is performed by setting the "VF Cache Invalidation Enable"
3977 * bit in PIPE_CONTROL."
3978 *
3979 * This is implemented by carefully tracking all vertex and index buffer
3980 * bindings and flushing if the cache ever ends up with a range in the cache
3981 * that would exceed 4 GiB. This is implemented in three parts:
3982 *
3983 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
3984 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
3985 * tracking code of the new binding. If this new binding would cause
3986 * the cache to have a too-large range on the next draw call, a pipeline
3987 * stall and VF cache invalidate are added to pending_pipeline_bits.
3988 *
3989 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
3990 * empty whenever we emit a VF invalidate.
3991 *
3992 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
3993 * after every 3DPRIMITIVE and copies the bound range into the dirty
3994 * range for each used buffer. This has to be a separate step because
3995 * we don't always re-bind all buffers and so 1. can't know which
3996 * buffers are actually bound.
3997 */
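
/* For reference, a rough sketch of how those three pieces interleave around
 * a single draw on Gfx9 (illustrative ordering only, not a literal call
 * sequence from this file):
 *
 *    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd, vb_index, addr, size);
 *    ... emit 3DSTATE_VERTEX_BUFFERS ...
 *    genX(cmd_buffer_apply_pipe_flushes)(cmd);   // may reset the tracking
 *    ... emit 3DPRIMITIVE ...
 *    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd, access, vb_used);
 */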
3998 void
3999 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4000 int vb_index,
4001 struct anv_address vb_address,
4002 uint32_t vb_size)
4003 {
4004 if (GFX_VER > 9)
4005 return;
4006
4007 struct anv_vb_cache_range *bound, *dirty;
4008 if (vb_index == -1) {
4009 bound = &cmd_buffer->state.gfx.ib_bound_range;
4010 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4011 } else {
4012 assert(vb_index >= 0);
4013 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4014 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4015 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4016 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4017 }
4018
4019 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4020 vb_address,
4021 vb_size)) {
4022 anv_add_pending_pipe_bits(cmd_buffer,
4023 ANV_PIPE_CS_STALL_BIT |
4024 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4025 "vb > 32b range");
4026 }
4027 }
4028
4029 void
4030 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4031 uint32_t access_type,
4032 uint64_t vb_used)
4033 {
4034 if (access_type == RANDOM) {
4035 /* We have an index buffer */
4036 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4037 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4038
4039 anv_merge_vb_cache_range(dirty, bound);
4040 }
4041
4042 uint64_t mask = vb_used;
4043 while (mask) {
4044 int i = u_bit_scan64(&mask);
4045 assert(i >= 0);
4046 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4047 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4048
4049 struct anv_vb_cache_range *bound, *dirty;
4050 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4051 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4052
4053 anv_merge_vb_cache_range(dirty, bound);
4054 }
4055 }
4056 #endif /* GFX_VER == 9 */
4057
4058 /**
4059 * Update the pixel hashing modes that determine the balancing of PS threads
4060 * across subslices and slices.
4061 *
4062 * \param width Width bound of the rendering area (already scaled down if \p
4063 * scale is greater than 1).
4064 * \param height Height bound of the rendering area (already scaled down if \p
4065 * scale is greater than 1).
4066 * \param scale The number of framebuffer samples that could potentially be
4067 * affected by an individual channel of the PS thread. This is
4068 * typically one for single-sampled rendering, but for operations
4069 * like CCS resolves and fast clears a single PS invocation may
4070 * update a huge number of pixels, in which case a finer
4071 * balancing is desirable in order to maximally utilize the
4072 * bandwidth available. UINT_MAX can be used as shorthand for
4073 * "finest hashing mode available".
4074 */
4075 void
4076 genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
4077 unsigned width, unsigned height,
4078 unsigned scale)
4079 {
4080 #if GFX_VER == 9
4081 const struct intel_device_info *devinfo = cmd_buffer->device->info;
4082 const unsigned slice_hashing[] = {
4083 /* Because all Gfx9 platforms with more than one slice require
4084 * three-way subslice hashing, a single "normal" 16x16 slice hashing
4085 * block is guaranteed to suffer from substantial imbalance, with one
4086 * subslice receiving twice as much work as the other two in the
4087 * slice.
4088 *
4089 * The performance impact of that would be particularly severe when
4090 * three-way hashing is also in use for slice balancing (which is the
4091 * case for all Gfx9 GT4 platforms), because one of the slices
4092 * receives one every three 16x16 blocks in either direction, which
4093 * is roughly the periodicity of the underlying subslice imbalance
4094 * pattern ("roughly" because in reality the hardware's
4095 * implementation of three-way hashing doesn't do exact modulo 3
4096 * arithmetic, which somewhat decreases the magnitude of this effect
4097 * in practice). This leads to a systematic subslice imbalance
4098 * within that slice regardless of the size of the primitive. The
4099 * 32x32 hashing mode guarantees that the subslice imbalance within a
4100 * single slice hashing block is minimal, largely eliminating this
4101 * effect.
4102 */
4103 _32x32,
4104 /* Finest slice hashing mode available. */
4105 NORMAL
4106 };
4107 const unsigned subslice_hashing[] = {
4108 /* 16x16 would provide a slight cache locality benefit especially
4109 * visible in the sampler L1 cache efficiency of low-bandwidth
4110 * non-LLC platforms, but it comes at the cost of greater subslice
4111 * imbalance for primitives of dimensions approximately intermediate
4112 * between 16x4 and 16x16.
4113 */
4114 _16x4,
4115 /* Finest subslice hashing mode available. */
4116 _8x4
4117 };
4118 /* Dimensions of the smallest hashing block of a given hashing mode. If
4119 * the rendering area is smaller than this there can't possibly be any
4120 * benefit from switching to this mode, so we optimize out the
4121 * transition.
4122 */
4123 const unsigned min_size[][2] = {
4124 { 16, 4 },
4125 { 8, 4 }
4126 };
4127 const unsigned idx = scale > 1;
4128
4129 if (cmd_buffer->state.current_hash_scale != scale &&
4130 (width > min_size[idx][0] || height > min_size[idx][1])) {
4131 anv_add_pending_pipe_bits(cmd_buffer,
4132 ANV_PIPE_CS_STALL_BIT |
4133 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
4134 "change pixel hash mode");
4135 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4136
4137 anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) {
4138 gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
4139 gt.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
4140 gt.SubsliceHashing = subslice_hashing[idx];
4141 gt.SubsliceHashingMask = -1;
4142 }
4143
4144 cmd_buffer->state.current_hash_scale = scale;
4145 }
4146 #endif
4147 }
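
/* Example (illustrative): regular single-sampled draws call this with
 * scale == 1, which selects the coarser 16x4 subslice hashing, while
 * operations like CCS resolves or fast clears can pass scale == UINT_MAX to
 * request the finest 8x4 mode, provided the rendering area exceeds the 8x4
 * minimum block size.
 */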
4148
4149 static void
4150 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4151 {
4152 struct anv_device *device = cmd_buffer->device;
4153 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4154
4155 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4156 device->isl_dev.ds.size / 4);
4157 if (dw == NULL)
4158 return;
4159
4160 struct isl_view isl_view = {};
4161 struct isl_depth_stencil_hiz_emit_info info = {
4162 .view = &isl_view,
4163 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4164 };
4165
4166 if (gfx->depth_att.iview != NULL) {
4167 isl_view = gfx->depth_att.iview->planes[0].isl;
4168 } else if (gfx->stencil_att.iview != NULL) {
4169 isl_view = gfx->stencil_att.iview->planes[0].isl;
4170 }
4171
4172 if (gfx->view_mask) {
4173 assert(isl_view.array_len == 0 ||
4174 isl_view.array_len >= util_last_bit(gfx->view_mask));
4175 isl_view.array_len = util_last_bit(gfx->view_mask);
4176 } else {
4177 assert(isl_view.array_len == 0 ||
4178 isl_view.array_len >= util_last_bit(gfx->layer_count));
4179 isl_view.array_len = gfx->layer_count;
4180 }
4181
4182 if (gfx->depth_att.iview != NULL) {
4183 const struct anv_image_view *iview = gfx->depth_att.iview;
4184 const struct anv_image *image = iview->image;
4185
4186 const uint32_t depth_plane =
4187 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4188 const struct anv_surface *depth_surface =
4189 &image->planes[depth_plane].primary_surface;
4190 const struct anv_address depth_address =
4191 anv_image_address(image, &depth_surface->memory_range);
4192
4193 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
4194
4195 info.depth_surf = &depth_surface->isl;
4196 info.depth_address = anv_address_physical(depth_address);
4197 info.mocs =
4198 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
4199
4200 info.hiz_usage = gfx->depth_att.aux_usage;
4201 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
4202 assert(isl_aux_usage_has_hiz(info.hiz_usage));
4203
4204 const struct anv_surface *hiz_surface =
4205 &image->planes[depth_plane].aux_surface;
4206 const struct anv_address hiz_address =
4207 anv_image_address(image, &hiz_surface->memory_range);
4208
4209 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
4210
4211 info.hiz_surf = &hiz_surface->isl;
4212 info.hiz_address = anv_address_physical(hiz_address);
4213
4214 info.depth_clear_value = ANV_HZ_FC_VAL;
4215 }
4216 }
4217
4218 if (gfx->stencil_att.iview != NULL) {
4219 const struct anv_image_view *iview = gfx->stencil_att.iview;
4220 const struct anv_image *image = iview->image;
4221
4222 const uint32_t stencil_plane =
4223 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
4224 const struct anv_surface *stencil_surface =
4225 &image->planes[stencil_plane].primary_surface;
4226 const struct anv_address stencil_address =
4227 anv_image_address(image, &stencil_surface->memory_range);
4228
4229 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
4230
4231 info.stencil_surf = &stencil_surface->isl;
4232
4233 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
4234 info.stencil_address = anv_address_physical(stencil_address);
4235 info.mocs =
4236 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
4237 }
4238
4239 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
4240
4241 /* Wa_14016712196:
4242 * Emit depth flush after state that sends implicit depth flush.
4243 */
4244 if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
4245 genx_batch_emit_pipe_control(&cmd_buffer->batch,
4246 cmd_buffer->device->info,
4247 cmd_buffer->state.current_pipeline,
4248 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
4249 }
4250
4251 if (info.depth_surf)
4252 genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
4253
4254 if (GFX_VER >= 11) {
4255 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
4256 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4257
4258 if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
4259 intel_needs_workaround(cmd_buffer->device->info, 14014097488)) {
4260 /* Wa_1408224581
4261 *
4262 * Workaround (Gfx12LP A-step only): an additional pipe control with
4263 * post-sync = store dword operation is required (the w/a is to
4264 * have an additional pipe control after the stencil state whenever
4265 * the surface state bits of this state are changing).
4266 *
4267 * This also seems sufficient to handle Wa_14014097488.
4268 */
4269 genx_batch_emit_pipe_control_write
4270 (&cmd_buffer->batch, cmd_buffer->device->info,
4271 cmd_buffer->state.current_pipeline, WriteImmediateData,
4272 cmd_buffer->device->workaround_address, 0, 0);
4273 }
4274 }
4275 cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
4276 }
4277
4278 static void
4279 cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
4280 const struct anv_image_view *fsr_iview)
4281 {
4282 #if GFX_VERx10 >= 125
4283 struct anv_device *device = cmd_buffer->device;
4284
4285 if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
4286 return;
4287
4288 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4289 device->isl_dev.cpb.size / 4);
4290 if (dw == NULL)
4291 return;
4292
4293 struct isl_cpb_emit_info info = { };
4294
4295 if (fsr_iview) {
4296 const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
4297
4298 anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
4299
4300 struct anv_address addr =
4301 anv_address_add(binding->address, binding->memory_range.offset);
4302
4303 info.view = &fsr_iview->planes[0].isl;
4304 info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
4305 info.address = anv_address_physical(addr);
4306 info.mocs =
4307 anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
4308 ISL_SURF_USAGE_CPB_BIT);
4309 }
4310
4311 isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
4312
4313 /* Wa_14016712196:
4314 * Emit depth flush after state that sends implicit depth flush.
4315 */
4316 if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
4317 genx_batch_emit_pipe_control(&cmd_buffer->batch,
4318 cmd_buffer->device->info,
4319 cmd_buffer->state.current_pipeline,
4320 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
4321 }
4322 #endif /* GFX_VERx10 >= 125 */
4323 }
4324
4325 static VkImageLayout
4326 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
4327 {
4328 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
4329 vk_find_struct_const(att->pNext,
4330 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
4331 if (layout_info != NULL)
4332 return layout_info->initialLayout;
4333
4334 return att->imageLayout;
4335 }
4336
4337 void genX(CmdBeginRendering)(
4338 VkCommandBuffer commandBuffer,
4339 const VkRenderingInfo* pRenderingInfo)
4340 {
4341 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4342 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4343 VkResult result;
4344
4345 if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
4346 assert(!"Trying to start a render pass on non-render queue!");
4347 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
4348 return;
4349 }
4350
4351 anv_measure_beginrenderpass(cmd_buffer);
4352 trace_intel_begin_render_pass(&cmd_buffer->trace);
4353
4354 gfx->rendering_flags = pRenderingInfo->flags;
4355 gfx->view_mask = pRenderingInfo->viewMask;
4356 gfx->layer_count = pRenderingInfo->layerCount;
4357 gfx->samples = 0;
4358
4359 if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
4360 gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
4361 gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
4362 gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
4363 gfx->render_area = pRenderingInfo->renderArea;
4364 gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
4365 }
4366
4367 const bool is_multiview = gfx->view_mask != 0;
4368 const VkRect2D render_area = gfx->render_area;
4369 const uint32_t layers =
4370 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
4371
4372 /* The framebuffer size is at least large enough to contain the render
4373 * area. Because a zero renderArea is possible, we MAX with 1.
4374 */
4375 struct isl_extent3d fb_size = {
4376 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
4377 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
4378 .d = layers,
4379 };
4380
4381 const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
4382 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
4383 if (result != VK_SUCCESS)
4384 return;
4385
4386 genX(flush_pipeline_select_3d)(cmd_buffer);
4387
4388 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4389 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
4390 continue;
4391
4392 const VkRenderingAttachmentInfo *att =
4393 &pRenderingInfo->pColorAttachments[i];
4394 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
4395 const VkImageLayout initial_layout = attachment_initial_layout(att);
4396
4397 assert(render_area.offset.x + render_area.extent.width <=
4398 iview->vk.extent.width);
4399 assert(render_area.offset.y + render_area.extent.height <=
4400 iview->vk.extent.height);
4401 assert(layers <= iview->vk.layer_count);
4402
4403 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
4404 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
4405
4406 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
4407 gfx->samples |= iview->vk.image->samples;
4408
4409 enum isl_aux_usage aux_usage =
4410 anv_layout_to_aux_usage(cmd_buffer->device->info,
4411 iview->image,
4412 VK_IMAGE_ASPECT_COLOR_BIT,
4413 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
4414 att->imageLayout,
4415 cmd_buffer->queue_family->queueFlags);
4416
4417 union isl_color_value fast_clear_color = { .u32 = { 0, } };
4418
4419 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
4420 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
4421 const union isl_color_value clear_color =
4422 vk_to_isl_color_with_format(att->clearValue.color,
4423 iview->planes[0].isl.format);
4424
4425 /* We only support fast-clears on the first layer */
4426 const bool fast_clear =
4427 (!is_multiview || (gfx->view_mask & 1)) &&
4428 anv_can_fast_clear_color_view(cmd_buffer->device, iview,
4429 att->imageLayout, clear_color,
4430 layers, render_area,
4431 cmd_buffer->queue_family->queueFlags);
4432
4433 if (att->imageLayout != initial_layout) {
4434 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
4435 render_area.extent.width == iview->vk.extent.width &&
4436 render_area.extent.height == iview->vk.extent.height);
4437 if (is_multiview) {
4438 u_foreach_bit(view, gfx->view_mask) {
4439 transition_color_buffer(cmd_buffer, iview->image,
4440 VK_IMAGE_ASPECT_COLOR_BIT,
4441 iview->vk.base_mip_level, 1,
4442 iview->vk.base_array_layer + view,
4443 1, /* layer_count */
4444 initial_layout, att->imageLayout,
4445 VK_QUEUE_FAMILY_IGNORED,
4446 VK_QUEUE_FAMILY_IGNORED,
4447 fast_clear);
4448 }
4449 } else {
4450 transition_color_buffer(cmd_buffer, iview->image,
4451 VK_IMAGE_ASPECT_COLOR_BIT,
4452 iview->vk.base_mip_level, 1,
4453 iview->vk.base_array_layer,
4454 gfx->layer_count,
4455 initial_layout, att->imageLayout,
4456 VK_QUEUE_FAMILY_IGNORED,
4457 VK_QUEUE_FAMILY_IGNORED,
4458 fast_clear);
4459 }
4460 }
4461
4462 uint32_t clear_view_mask = pRenderingInfo->viewMask;
4463 uint32_t base_clear_layer = iview->vk.base_array_layer;
4464 uint32_t clear_layer_count = gfx->layer_count;
4465 if (fast_clear) {
4466 /* We only support fast-clears on the first layer */
4467 assert(iview->vk.base_mip_level == 0 &&
4468 iview->vk.base_array_layer == 0);
4469
4470 fast_clear_color = clear_color;
4471
4472 if (iview->image->vk.samples == 1) {
4473 anv_image_ccs_op(cmd_buffer, iview->image,
4474 iview->planes[0].isl.format,
4475 iview->planes[0].isl.swizzle,
4476 VK_IMAGE_ASPECT_COLOR_BIT,
4477 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
4478 &fast_clear_color,
4479 false);
4480 } else {
4481 anv_image_mcs_op(cmd_buffer, iview->image,
4482 iview->planes[0].isl.format,
4483 iview->planes[0].isl.swizzle,
4484 VK_IMAGE_ASPECT_COLOR_BIT,
4485 0, 1, ISL_AUX_OP_FAST_CLEAR,
4486 &fast_clear_color,
4487 false);
4488 }
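/* The first layer (or view 0) was just handled by the fast clear above,
 * so exclude it from the slow-clear pass below.
 */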
4489 clear_view_mask &= ~1u;
4490 base_clear_layer++;
4491 clear_layer_count--;
4492
4493 genX(set_fast_clear_state)(cmd_buffer, iview->image,
4494 iview->planes[0].isl.format,
4495 clear_color);
4496 }
4497
4498 if (is_multiview) {
4499 u_foreach_bit(view, clear_view_mask) {
4500 anv_image_clear_color(cmd_buffer, iview->image,
4501 VK_IMAGE_ASPECT_COLOR_BIT,
4502 aux_usage,
4503 iview->planes[0].isl.format,
4504 iview->planes[0].isl.swizzle,
4505 iview->vk.base_mip_level,
4506 iview->vk.base_array_layer + view, 1,
4507 render_area, clear_color);
4508 }
4509 } else {
4510 anv_image_clear_color(cmd_buffer, iview->image,
4511 VK_IMAGE_ASPECT_COLOR_BIT,
4512 aux_usage,
4513 iview->planes[0].isl.format,
4514 iview->planes[0].isl.swizzle,
4515 iview->vk.base_mip_level,
4516 base_clear_layer, clear_layer_count,
4517 render_area, clear_color);
4518 }
4519 } else {
4520 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
4521 assert(att->imageLayout == initial_layout);
4522 }
4523
4524 gfx->color_att[i].vk_format = iview->vk.format;
4525 gfx->color_att[i].iview = iview;
4526 gfx->color_att[i].layout = att->imageLayout;
4527 gfx->color_att[i].aux_usage = aux_usage;
4528
4529 struct isl_view isl_view = iview->planes[0].isl;
4530 if (pRenderingInfo->viewMask) {
4531 assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
4532 isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
4533 } else {
4534 assert(isl_view.array_len >= pRenderingInfo->layerCount);
4535 isl_view.array_len = pRenderingInfo->layerCount;
4536 }
4537
4538 anv_image_fill_surface_state(cmd_buffer->device,
4539 iview->image,
4540 VK_IMAGE_ASPECT_COLOR_BIT,
4541 &isl_view,
4542 ISL_SURF_USAGE_RENDER_TARGET_BIT,
4543 aux_usage, &fast_clear_color,
4544 0, /* anv_image_view_state_flags */
4545 &gfx->color_att[i].surface_state);
4546
4547 add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
4548
4549 if (GFX_VER < 10 &&
4550 (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
4551 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
4552 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
4553 iview->planes[0].isl.base_level == 0 &&
4554 iview->planes[0].isl.base_array_layer == 0) {
4555 genX(load_image_clear_color)(cmd_buffer,
4556 gfx->color_att[i].surface_state.state,
4557 iview->image);
4558 }
4559
4560 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
4561 gfx->color_att[i].resolve_mode = att->resolveMode;
4562 gfx->color_att[i].resolve_iview =
4563 anv_image_view_from_handle(att->resolveImageView);
4564 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
4565 }
4566 }
4567
4568 anv_cmd_graphic_state_update_has_uint_rt(gfx);
4569
4570 const struct anv_image_view *fsr_iview = NULL;
4571 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
4572 vk_find_struct_const(pRenderingInfo->pNext,
4573 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
4574 if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
4575 fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
4576 /* imageLayout and shadingRateAttachmentTexelSize are ignored */
4577 }
4578
4579 const struct anv_image_view *ds_iview = NULL;
4580 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
4581 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
4582 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
4583 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
4584 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
4585 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4586 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4587 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4588 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
4589 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
4590 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
4591 float depth_clear_value = 0;
4592 uint32_t stencil_clear_value = 0;
4593
4594 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
4595 d_iview = anv_image_view_from_handle(d_att->imageView);
4596 initial_depth_layout = attachment_initial_layout(d_att);
4597 depth_layout = d_att->imageLayout;
4598 depth_aux_usage =
4599 anv_layout_to_aux_usage(cmd_buffer->device->info,
4600 d_iview->image,
4601 VK_IMAGE_ASPECT_DEPTH_BIT,
4602 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
4603 depth_layout,
4604 cmd_buffer->queue_family->queueFlags);
4605 depth_clear_value = d_att->clearValue.depthStencil.depth;
4606 }
4607
4608 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
4609 s_iview = anv_image_view_from_handle(s_att->imageView);
4610 initial_stencil_layout = attachment_initial_layout(s_att);
4611 stencil_layout = s_att->imageLayout;
4612 stencil_aux_usage =
4613 anv_layout_to_aux_usage(cmd_buffer->device->info,
4614 s_iview->image,
4615 VK_IMAGE_ASPECT_STENCIL_BIT,
4616 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
4617 stencil_layout,
4618 cmd_buffer->queue_family->queueFlags);
4619 stencil_clear_value = s_att->clearValue.depthStencil.stencil;
4620 }
4621
4622 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
4623 ds_iview = d_iview != NULL ? d_iview : s_iview;
4624 assert(ds_iview != NULL);
4625
4626 assert(render_area.offset.x + render_area.extent.width <=
4627 ds_iview->vk.extent.width);
4628 assert(render_area.offset.y + render_area.extent.height <=
4629 ds_iview->vk.extent.height);
4630 assert(layers <= ds_iview->vk.layer_count);
4631
4632 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
4633 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
4634
4635 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
4636 gfx->samples |= ds_iview->vk.image->samples;
4637
4638 VkImageAspectFlags clear_aspects = 0;
4639 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
4640 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
4641 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
4642 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
4643 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
4644 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
4645
4646 if (clear_aspects != 0) {
4647 const bool hiz_clear =
4648 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
4649 depth_layout, clear_aspects,
4650 depth_clear_value,
4651 render_area,
4652 cmd_buffer->queue_family->queueFlags);
4653
4654 if (depth_layout != initial_depth_layout) {
4655 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
4656 render_area.extent.width == d_iview->vk.extent.width &&
4657 render_area.extent.height == d_iview->vk.extent.height);
4658
4659 if (is_multiview) {
4660 u_foreach_bit(view, gfx->view_mask) {
4661 transition_depth_buffer(cmd_buffer, d_iview->image,
4662 d_iview->vk.base_array_layer + view,
4663 1 /* layer_count */,
4664 initial_depth_layout, depth_layout,
4665 hiz_clear);
4666 }
4667 } else {
4668 transition_depth_buffer(cmd_buffer, d_iview->image,
4669 d_iview->vk.base_array_layer,
4670 gfx->layer_count,
4671 initial_depth_layout, depth_layout,
4672 hiz_clear);
4673 }
4674 }
4675
4676 if (stencil_layout != initial_stencil_layout) {
4677 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
4678 render_area.extent.width == s_iview->vk.extent.width &&
4679 render_area.extent.height == s_iview->vk.extent.height);
4680
4681 if (is_multiview) {
4682 u_foreach_bit(view, gfx->view_mask) {
4683 transition_stencil_buffer(cmd_buffer, s_iview->image,
4684 s_iview->vk.base_mip_level, 1,
4685 s_iview->vk.base_array_layer + view,
4686 1 /* layer_count */,
4687 initial_stencil_layout,
4688 stencil_layout,
4689 hiz_clear);
4690 }
4691 } else {
4692 transition_stencil_buffer(cmd_buffer, s_iview->image,
4693 s_iview->vk.base_mip_level, 1,
4694 s_iview->vk.base_array_layer,
4695 gfx->layer_count,
4696 initial_stencil_layout,
4697 stencil_layout,
4698 hiz_clear);
4699 }
4700 }
4701
4702 if (is_multiview) {
4703 uint32_t clear_view_mask = pRenderingInfo->viewMask;
4704 while (clear_view_mask) {
4705 int view = u_bit_scan(&clear_view_mask);
4706
4707 uint32_t level = ds_iview->vk.base_mip_level;
4708 uint32_t layer = ds_iview->vk.base_array_layer + view;
4709
4710 if (hiz_clear) {
4711 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
4712 clear_aspects,
4713 level, layer, 1,
4714 render_area,
4715 stencil_clear_value);
4716 } else {
4717 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
4718 clear_aspects,
4719 depth_aux_usage,
4720 level, layer, 1,
4721 render_area,
4722 depth_clear_value,
4723 stencil_clear_value);
4724 }
4725 }
4726 } else {
4727 uint32_t level = ds_iview->vk.base_mip_level;
4728 uint32_t base_layer = ds_iview->vk.base_array_layer;
4729 uint32_t layer_count = gfx->layer_count;
4730
4731 if (hiz_clear) {
4732 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
4733 clear_aspects,
4734 level, base_layer, layer_count,
4735 render_area,
4736 stencil_clear_value);
4737 } else {
4738 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
4739 clear_aspects,
4740 depth_aux_usage,
4741 level, base_layer, layer_count,
4742 render_area,
4743 depth_clear_value,
4744 stencil_clear_value);
4745 }
4746 }
4747 } else {
4748 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
4749 assert(depth_layout == initial_depth_layout);
4750 assert(stencil_layout == initial_stencil_layout);
4751 }
4752
4753 if (d_iview != NULL) {
4754 gfx->depth_att.vk_format = d_iview->vk.format;
4755 gfx->depth_att.iview = d_iview;
4756 gfx->depth_att.layout = depth_layout;
4757 gfx->depth_att.aux_usage = depth_aux_usage;
4758 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
4759 assert(d_att->resolveImageView != VK_NULL_HANDLE);
4760 gfx->depth_att.resolve_mode = d_att->resolveMode;
4761 gfx->depth_att.resolve_iview =
4762 anv_image_view_from_handle(d_att->resolveImageView);
4763 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
4764 }
4765 }
4766
4767 if (s_iview != NULL) {
4768 gfx->stencil_att.vk_format = s_iview->vk.format;
4769 gfx->stencil_att.iview = s_iview;
4770 gfx->stencil_att.layout = stencil_layout;
4771 gfx->stencil_att.aux_usage = stencil_aux_usage;
4772 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
4773 assert(s_att->resolveImageView != VK_NULL_HANDLE);
4774 gfx->stencil_att.resolve_mode = s_att->resolveMode;
4775 gfx->stencil_att.resolve_iview =
4776 anv_image_view_from_handle(s_att->resolveImageView);
4777 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
4778 }
4779 }
4780 }
4781
4782 /* Finally, now that we know the right size, set up the null surface */
4783 assert(util_bitcount(gfx->samples) <= 1);
4784 isl_null_fill_state(&cmd_buffer->device->isl_dev,
4785 gfx->null_surface_state.map,
4786 .size = fb_size);
4787
4788 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4789 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
4790 continue;
4791
4792 isl_null_fill_state(&cmd_buffer->device->isl_dev,
4793 gfx->color_att[i].surface_state.state.map,
4794 .size = fb_size);
4795 }
4796
4797 /****** We can now start emitting code to begin the render pass ******/
4798
4799 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
4800
4801 /* It is possible to start a render pass with an old pipeline already
4802 * bound. Because the render pass and subpass index are both baked into
4803 * the pipeline, this is highly unlikely: it requires a render pass with
4804 * a single subpass, used twice back-to-back, with the same pipeline
4805 * bound at the start of the second render pass as at the end of the
4806 * first render pass. In order to avoid unpredictable issues with this
4807 * edge case, we simply dirty the pipeline at the start of every
4808 * subpass.
4809 */
4810 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
4811
4812 #if GFX_VER >= 11
4813 bool has_color_att = false;
4814 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4815 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) {
4816 has_color_att = true;
4817 break;
4818 }
4819 }
4820 if (has_color_att) {
4821 /* The PIPE_CONTROL command description says:
4822 *
4823 * "Whenever a Binding Table Index (BTI) used by a Render Target Message
4824 * points to a different RENDER_SURFACE_STATE, SW must issue a Render
4825 * Target Cache Flush by enabling this bit. When render target flush
4826 * is set due to new association of BTI, PS Scoreboard Stall bit must
4827 * be set in this packet."
4828 *
4829 * We assume that a new BeginRendering is always changing the RTs, which
4830 * may not be true and can cause excessive flushing. We can trivially skip
4831 * it when there are no RTs (depth-only rendering), though.
4832 */
4833 anv_add_pending_pipe_bits(cmd_buffer,
4834 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4835 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
4836 "change RT");
4837 }
4838 #endif
4839
4840 cmd_buffer_emit_depth_stencil(cmd_buffer);
4841
4842 cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
4843 }
4844
4845 static void
4846 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
4847 struct anv_attachment *att,
4848 VkImageAspectFlagBits aspect)
4849 {
4850 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4851 const struct anv_image_view *iview = att->iview;
4852
4853 if (iview == NULL)
4854 return;
4855
4856 if (gfx->view_mask == 0) {
4857 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
4858 aspect, att->aux_usage,
4859 iview->planes[0].isl.base_level,
4860 iview->planes[0].isl.base_array_layer,
4861 gfx->layer_count);
4862 } else {
4863 uint32_t res_view_mask = gfx->view_mask;
4864 while (res_view_mask) {
4865 int i = u_bit_scan(&res_view_mask);
4866
4867 const uint32_t level = iview->planes[0].isl.base_level;
4868 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
4869
4870 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
4871 aspect, att->aux_usage,
4872 level, layer, 1);
4873 }
4874 }
4875 }
4876
4877 static enum blorp_filter
4878 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
4879 {
4880 switch (vk_mode) {
4881 case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
4882 return BLORP_FILTER_SAMPLE_0;
4883 case VK_RESOLVE_MODE_AVERAGE_BIT:
4884 return BLORP_FILTER_AVERAGE;
4885 case VK_RESOLVE_MODE_MIN_BIT:
4886 return BLORP_FILTER_MIN_SAMPLE;
4887 case VK_RESOLVE_MODE_MAX_BIT:
4888 return BLORP_FILTER_MAX_SAMPLE;
4889 default:
4890 return BLORP_FILTER_NONE;
4891 }
4892 }
4893
4894 static void
4895 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
4896 const struct anv_attachment *att,
4897 VkImageLayout layout,
4898 VkImageAspectFlagBits aspect)
4899 {
4900 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4901 const struct anv_image_view *src_iview = att->iview;
4902 const struct anv_image_view *dst_iview = att->resolve_iview;
4903
4904 enum isl_aux_usage src_aux_usage =
4905 anv_layout_to_aux_usage(cmd_buffer->device->info,
4906 src_iview->image, aspect,
4907 VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
4908 layout,
4909 cmd_buffer->queue_family->queueFlags);
4910
4911 enum isl_aux_usage dst_aux_usage =
4912 anv_layout_to_aux_usage(cmd_buffer->device->info,
4913 dst_iview->image, aspect,
4914 VK_IMAGE_USAGE_TRANSFER_DST_BIT,
4915 att->resolve_layout,
4916 cmd_buffer->queue_family->queueFlags);
4917
4918 enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
4919
4920 const VkRect2D render_area = gfx->render_area;
4921 if (gfx->view_mask == 0) {
4922 anv_image_msaa_resolve(cmd_buffer,
4923 src_iview->image, src_aux_usage,
4924 src_iview->planes[0].isl.base_level,
4925 src_iview->planes[0].isl.base_array_layer,
4926 dst_iview->image, dst_aux_usage,
4927 dst_iview->planes[0].isl.base_level,
4928 dst_iview->planes[0].isl.base_array_layer,
4929 aspect,
4930 render_area.offset.x, render_area.offset.y,
4931 render_area.offset.x, render_area.offset.y,
4932 render_area.extent.width,
4933 render_area.extent.height,
4934 gfx->layer_count, filter);
4935 } else {
4936 uint32_t res_view_mask = gfx->view_mask;
4937 while (res_view_mask) {
4938 int i = u_bit_scan(&res_view_mask);
4939
4940 anv_image_msaa_resolve(cmd_buffer,
4941 src_iview->image, src_aux_usage,
4942 src_iview->planes[0].isl.base_level,
4943 src_iview->planes[0].isl.base_array_layer + i,
4944 dst_iview->image, dst_aux_usage,
4945 dst_iview->planes[0].isl.base_level,
4946 dst_iview->planes[0].isl.base_array_layer + i,
4947 aspect,
4948 render_area.offset.x, render_area.offset.y,
4949 render_area.offset.x, render_area.offset.y,
4950 render_area.extent.width,
4951 render_area.extent.height,
4952 1, filter);
4953 }
4954 }
4955 }
4956
4957 void genX(CmdEndRendering)(
4958 VkCommandBuffer commandBuffer)
4959 {
4960 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4961 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4962
4963 if (anv_batch_has_error(&cmd_buffer->batch))
4964 return;
4965
4966 const bool is_multiview = gfx->view_mask != 0;
4967 const uint32_t layers =
4968 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
4969
4970 bool has_color_resolve = false;
4971 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
4972 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
4973 VK_IMAGE_ASPECT_COLOR_BIT);
4974
4975 /* Stash this off for later */
4976 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
4977 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
4978 has_color_resolve = true;
4979 }
4980
4981 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
4982 VK_IMAGE_ASPECT_DEPTH_BIT);
4983
4984 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
4985 VK_IMAGE_ASPECT_STENCIL_BIT);
4986
4987 if (has_color_resolve) {
4988 /* We are about to do some MSAA resolves. We need to flush so that the
4989 * results of writes to the MSAA color attachments show up in the sampler
4990 * when we blit to the single-sampled resolve target.
4991 */
4992 anv_add_pending_pipe_bits(cmd_buffer,
4993 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4994 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
4995 "MSAA resolve");
4996 }
4997
4998 if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT) &&
4999 (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
5000 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)) {
5001 /* We are about to do some MSAA resolves. We need to flush so that the
5002 * results of writes to the MSAA depth attachments show up in the sampler
5003 * when we blit to the single-sampled resolve target.
5004 */
5005 anv_add_pending_pipe_bits(cmd_buffer,
5006 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5007 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5008 "MSAA resolve");
5009 }
5010
5011 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5012 const struct anv_attachment *att = &gfx->color_att[i];
5013 if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
5014 (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5015 continue;
5016
5017 cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
5018 VK_IMAGE_ASPECT_COLOR_BIT);
5019 }
5020
5021 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5022 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5023 const struct anv_image_view *src_iview = gfx->depth_att.iview;
5024
5025 /* MSAA resolves sample from the source attachment. Transition the
5026 * depth attachment first to get rid of any HiZ that we may not be
5027 * able to handle.
5028 */
5029 transition_depth_buffer(cmd_buffer, src_iview->image,
5030 src_iview->planes[0].isl.base_array_layer,
5031 layers,
5032 gfx->depth_att.layout,
5033 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5034 false /* will_full_fast_clear */);
5035
5036 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
5037 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5038 VK_IMAGE_ASPECT_DEPTH_BIT);
5039
5040 /* Transition the source back to the original layout. This seems a bit
5041 * inefficient but, since HiZ resolves aren't destructive, going from
5042 * less HiZ to more is generally a no-op.
5043 */
5044 transition_depth_buffer(cmd_buffer, src_iview->image,
5045 src_iview->planes[0].isl.base_array_layer,
5046 layers,
5047 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5048 gfx->depth_att.layout,
5049 false /* will_full_fast_clear */);
5050 }
5051
5052 if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5053 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5054 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
5055 gfx->stencil_att.layout,
5056 VK_IMAGE_ASPECT_STENCIL_BIT);
5057 }
5058
5059
5060 trace_intel_end_render_pass(&cmd_buffer->trace,
5061 gfx->render_area.extent.width,
5062 gfx->render_area.extent.height,
5063 gfx->color_att_count,
5064 gfx->samples);
5065
5066 anv_cmd_buffer_reset_rendering(cmd_buffer);
5067 }
5068
5069 void
5070 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5071 {
5072 struct mi_builder b;
5073 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5074
5075 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5076 mi_reg32(ANV_PREDICATE_RESULT_REG));
5077 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5078
5079 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5080 mip.LoadOperation = LOAD_LOADINV;
5081 mip.CombineOperation = COMBINE_SET;
5082 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5083 }
5084 }
5085
5086 void genX(CmdBeginConditionalRenderingEXT)(
5087 VkCommandBuffer commandBuffer,
5088 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5089 {
5090 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5091 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5092 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5093 struct anv_address value_address =
5094 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5095
5096 const bool isInverted = pConditionalRenderingBegin->flags &
5097 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5098
5099 cmd_state->conditional_render_enabled = true;
5100
5101 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5102
5103 struct mi_builder b;
5104 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5105 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
5106 mi_builder_set_mocs(&b, mocs);
5107
5108 /* Section 19.4 of the Vulkan 1.1.85 spec says:
5109 *
5110 * If the value of the predicate in buffer memory changes
5111 * while conditional rendering is active, the rendering commands
5112 * may be discarded in an implementation-dependent way.
5113 * Some implementations may latch the value of the predicate
5114 * upon beginning conditional rendering while others
5115 * may read it before every rendering command.
5116 *
5117 * So it's perfectly fine to read a value from the buffer once.
5118 */
5119 struct mi_value value = mi_mem32(value_address);
5120
5121 /* Precompute the predicate result; this is necessary to support secondary
5122 * command buffers, since it is unknown whether conditional rendering is
5123 * inverted when they are populated.
5124 */
5125 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5126 isInverted ? mi_uge(&b, mi_imm(0), value) :
5127 mi_ult(&b, mi_imm(0), value));
5128 }
5129
5130 void genX(CmdEndConditionalRenderingEXT)(
5131 VkCommandBuffer commandBuffer)
5132 {
5133 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5134 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5135
5136 cmd_state->conditional_render_enabled = false;
5137 }
5138
5139 /* Set of stage bits which are pipelined, i.e. they get queued
5140 * by the command streamer for later execution.
5141 */
5142 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5143 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5144 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5145 VK_PIPELINE_STAGE_2_HOST_BIT | \
5146 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
5147
5148 void genX(CmdSetEvent2)(
5149 VkCommandBuffer commandBuffer,
5150 VkEvent _event,
5151 const VkDependencyInfo* pDependencyInfo)
5152 {
5153 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5154 ANV_FROM_HANDLE(anv_event, event, _event);
5155
5156 if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5157 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5158 flush.PostSyncOperation = WriteImmediateData;
5159 flush.Address = anv_state_pool_state_address(
5160 &cmd_buffer->device->dynamic_state_pool,
5161 event->state);
5162 flush.ImmediateData = VK_EVENT_SET;
5163 }
5164 return;
5165 }
5166
5167 VkPipelineStageFlags2 src_stages = 0;
5168
5169 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5170 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5171 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5172 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5173 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5174 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5175
5176 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5177 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5178
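/* For pipelined source stages, add CS and pixel scoreboard stalls so the
 * immediate-data write below only lands once that work has drained.
 */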
5179 enum anv_pipe_bits pc_bits = 0;
5180 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5181 pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5182 pc_bits |= ANV_PIPE_CS_STALL_BIT;
5183 }
5184
5185 genx_batch_emit_pipe_control_write
5186 (&cmd_buffer->batch, cmd_buffer->device->info,
5187 cmd_buffer->state.current_pipeline, WriteImmediateData,
5188 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5189 event->state),
5190 VK_EVENT_SET, pc_bits);
5191 }
5192
5193 void genX(CmdResetEvent2)(
5194 VkCommandBuffer commandBuffer,
5195 VkEvent _event,
5196 VkPipelineStageFlags2 stageMask)
5197 {
5198 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5199 ANV_FROM_HANDLE(anv_event, event, _event);
5200
5201 if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
5202 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
5203 flush.PostSyncOperation = WriteImmediateData;
5204 flush.Address = anv_state_pool_state_address(
5205 &cmd_buffer->device->dynamic_state_pool,
5206 event->state);
5207 flush.ImmediateData = VK_EVENT_RESET;
5208 }
5209 return;
5210 }
5211
5212 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5213 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5214
5215 enum anv_pipe_bits pc_bits = 0;
5216 if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5217 pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
5218 pc_bits |= ANV_PIPE_CS_STALL_BIT;
5219 }
5220
5221 genx_batch_emit_pipe_control_write
5222 (&cmd_buffer->batch, cmd_buffer->device->info,
5223 cmd_buffer->state.current_pipeline, WriteImmediateData,
5224 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5225 event->state),
5226 VK_EVENT_RESET,
5227 pc_bits);
5228 }
5229
5230 void genX(CmdWaitEvents2)(
5231 VkCommandBuffer commandBuffer,
5232 uint32_t eventCount,
5233 const VkEvent* pEvents,
5234 const VkDependencyInfo* pDependencyInfos)
5235 {
5236 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5237
5238 for (uint32_t i = 0; i < eventCount; i++) {
5239 ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
5240
5241 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5242 sem.WaitMode = PollingMode;
5243 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
5244 sem.SemaphoreDataDword = VK_EVENT_SET;
5245 sem.SemaphoreAddress = anv_state_pool_state_address(
5246 &cmd_buffer->device->dynamic_state_pool,
5247 event->state);
5248 }
5249 }
5250
5251 cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
5252 }
5253
5254 static uint32_t vk_to_intel_index_type(VkIndexType type)
5255 {
5256 switch (type) {
5257 case VK_INDEX_TYPE_UINT8_KHR:
5258 return INDEX_BYTE;
5259 case VK_INDEX_TYPE_UINT16:
5260 return INDEX_WORD;
5261 case VK_INDEX_TYPE_UINT32:
5262 return INDEX_DWORD;
5263 default:
5264 unreachable("invalid index type");
5265 }
5266 }
5267
5268 void genX(CmdBindIndexBuffer2KHR)(
5269 VkCommandBuffer commandBuffer,
5270 VkBuffer _buffer,
5271 VkDeviceSize offset,
5272 VkDeviceSize size,
5273 VkIndexType indexType)
5274 {
5275 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5276 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5277
5278 uint32_t restart_index = vk_index_to_restart(indexType);
5279 if (cmd_buffer->state.gfx.restart_index != restart_index) {
5280 cmd_buffer->state.gfx.restart_index = restart_index;
5281 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
5282 }
5283
5284 uint32_t index_type = vk_to_intel_index_type(indexType);
5285 if (cmd_buffer->state.gfx.index_buffer != buffer ||
5286 cmd_buffer->state.gfx.index_type != index_type ||
5287 cmd_buffer->state.gfx.index_offset != offset) {
5288 cmd_buffer->state.gfx.index_buffer = buffer;
5289 cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
5290 cmd_buffer->state.gfx.index_offset = offset;
5291 cmd_buffer->state.gfx.index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
5292 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
5293 }
5294 }
5295
5296 VkResult genX(CmdSetPerformanceOverrideINTEL)(
5297 VkCommandBuffer commandBuffer,
5298 const VkPerformanceOverrideInfoINTEL* pOverrideInfo)
5299 {
5300 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5301
5302 switch (pOverrideInfo->type) {
5303 case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
5304 anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
5305 csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
5306 csdm2.MediaInstructionDisable = pOverrideInfo->enable;
5307 csdm2._3DRenderingInstructionDisableMask = true;
5308 csdm2.MediaInstructionDisableMask = true;
5309 }
5310 break;
5311 }
5312
5313 case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
5314 if (pOverrideInfo->enable) {
5315 /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
5316 anv_add_pending_pipe_bits(cmd_buffer,
5317 ANV_PIPE_FLUSH_BITS |
5318 ANV_PIPE_INVALIDATE_BITS,
5319 "perf counter isolation");
5320 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5321 }
5322 break;
5323
5324 default:
5325 unreachable("Invalid override");
5326 }
5327
5328 return VK_SUCCESS;
5329 }
5330
5331 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
5332 VkCommandBuffer commandBuffer,
5333 const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo)
5334 {
5335 /* TODO: Waiting on the register write might depend on the generation. */
5336
5337 return VK_SUCCESS;
5338 }
5339
5340 #define TIMESTAMP 0x2358
5341
5342 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
5343 struct anv_device *device,
5344 struct anv_address addr,
5345 enum anv_timestamp_capture_type type,
5346 void *data) {
5347 /* Make sure ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
5348 * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on the
5349 * copy or video queues.
5350 */
5351 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
5352 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
5353 assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
5354 type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
5355 }
5356
5357 switch (type) {
5358 case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
5359 struct mi_builder b;
5360 mi_builder_init(&b, device->info, batch);
5361 mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
5362 break;
5363 }
5364
5365 case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
5366 if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
5367 (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
5368 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
5369 if (intel_needs_workaround(device->info, 16018063123))
5370 genX(batch_emit_fast_color_dummy_blit)(batch, device);
5371 anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
5372 fd.PostSyncOperation = WriteTimestamp;
5373 fd.Address = addr;
5374 }
5375 } else {
5376 genx_batch_emit_pipe_control_write(batch, device->info, 0,
5377 WriteTimestamp, addr, 0, 0);
5378 }
5379 break;
5380 }
5381
5382 case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
5383 genx_batch_emit_pipe_control_write
5384 (batch, device->info, 0, WriteTimestamp, addr, 0,
5385 ANV_PIPE_CS_STALL_BIT);
5386 break;
5387
5388 #if GFX_VERx10 >= 125
5389 case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
5390 uint32_t dwords[GENX(COMPUTE_WALKER_length)];
5391
5392 GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
5393 .PostSync = (struct GENX(POSTSYNC_DATA)) {
5394 .Operation = WriteTimestamp,
5395 .DestinationAddress = addr,
5396 .MOCS = anv_mocs(device, NULL, 0),
5397 },
5398 });
5399
5400 for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
5401 ((uint32_t *)data)[i] |= dwords[i];
5402 break;
5403 }
5404
5405 case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
5406 uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
5407
5408 GENX(EXECUTE_INDIRECT_DISPATCH_pack)
5409 (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
5410 .MOCS = anv_mocs(device, NULL, 0),
5411 .COMPUTE_WALKER_BODY = {
5412 .PostSync = (struct GENX(POSTSYNC_DATA)) {
5413 .Operation = WriteTimestamp,
5414 .DestinationAddress = addr,
5415 .MOCS = anv_mocs(device, NULL, 0),
5416 },
5417 }
5418 });
5419
5420 for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
5421 ((uint32_t *)data)[i] |= dwords[i];
5422 break;
5423 }
5424 #endif
5425
5426 default:
5427 unreachable("invalid");
5428 }
5429 }
5430
5431 void genX(batch_emit_secondary_call)(struct anv_batch *batch,
5432 struct anv_address secondary_addr,
5433 struct anv_address secondary_return_addr)
5434 {
5435 /* Emit a write to change the return address of the secondary */
5436 uint64_t *write_return_addr =
5437 anv_batch_emitn(batch,
5438 GENX(MI_STORE_DATA_IMM_length) + 1 /* QWord write */,
5439 GENX(MI_STORE_DATA_IMM),
5440 #if GFX_VER >= 12
5441 .ForceWriteCompletionCheck = true,
5442 #endif
5443 .Address = secondary_return_addr) +
5444 GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 8;
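/* write_return_addr now points at the 64-bit ImmediateData payload of the
 * MI_STORE_DATA_IMM above (the _start value is a bit offset, hence the
 * division by 8). It gets patched once the address after the jump below
 * is known.
 */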
5445
5446 #if GFX_VER >= 12
5447 /* Disable prefetcher before jumping into a secondary */
5448 anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
5449 arb.PreParserDisableMask = true;
5450 arb.PreParserDisable = true;
5451 }
5452 #endif
5453
5454 /* Jump into the secondary */
5455 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
5456 bbs.AddressSpaceIndicator = ASI_PPGTT;
5457 bbs.SecondLevelBatchBuffer = Firstlevelbatch;
5458 bbs.BatchBufferStartAddress = secondary_addr;
5459 }
5460
5461 /* Replace the return address written by the MI_STORE_DATA_IMM above with
5462 * the primary's current batch address (immediately after the jump).
5463 */
5464 *write_return_addr =
5465 anv_address_physical(anv_batch_current_address(batch));
5466 }
5467
5468 void *
5469 genX(batch_emit_return)(struct anv_batch *batch)
5470 {
5471 return anv_batch_emitn(batch,
5472 GENX(MI_BATCH_BUFFER_START_length),
5473 GENX(MI_BATCH_BUFFER_START),
5474 .AddressSpaceIndicator = ASI_PPGTT,
5475 .SecondLevelBatchBuffer = Firstlevelbatch);
5476 }
5477
5478 void
5479 genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
5480 const struct anv_device *device,
5481 uint32_t primitive_topology,
5482 uint32_t vertex_count)
5483 {
5484 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
5485 if (intel_needs_workaround(device->info, 22014412737) &&
5486 (primitive_topology == _3DPRIM_POINTLIST ||
5487 primitive_topology == _3DPRIM_LINELIST ||
5488 primitive_topology == _3DPRIM_LINESTRIP ||
5489 primitive_topology == _3DPRIM_LINELIST_ADJ ||
5490 primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
5491 primitive_topology == _3DPRIM_LINELOOP ||
5492 primitive_topology == _3DPRIM_POINTLIST_BF ||
5493 primitive_topology == _3DPRIM_LINESTRIP_CONT ||
5494 primitive_topology == _3DPRIM_LINESTRIP_BF ||
5495 primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
5496 (vertex_count == 1 || vertex_count == 2)) {
5497 genx_batch_emit_pipe_control_write
5498 (batch, device->info, 0, WriteImmediateData,
5499 device->workaround_address, 0, 0);
5500
5501 /* Reset counter because we just emitted a PC */
5502 batch->num_3d_primitives_emitted = 0;
5503 } else if (intel_needs_workaround(device->info, 16014538804)) {
5504 batch->num_3d_primitives_emitted++;
5505 /* Wa_16014538804:
5506 * After every three 3DPRIMITIVE commands,
5507 * at least one PIPE_CONTROL must be inserted.
5508 */
5509 if (batch->num_3d_primitives_emitted == 3) {
5510 anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
5511 batch->num_3d_primitives_emitted = 0;
5512 }
5513 }
5514 #endif
5515 }
5516
5517 /* Wa_16018063123 */
5518 ALWAYS_INLINE void
5519 genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
5520 struct anv_device *device)
5521 {
5522 #if GFX_VERx10 >= 125
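/* Emit a tiny (1x4, linear) fast-color blit targeting the workaround BO.
 * The written contents are irrelevant; callers only need a fast-color
 * blit to precede their MI_FLUSH_DW for Wa_16018063123.
 */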
5523 anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
5524 blt.DestinationBaseAddress = device->workaround_address;
5525 blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
5526 blt.DestinationPitch = 63;
5527 blt.DestinationX2 = 1;
5528 blt.DestinationY2 = 4;
5529 blt.DestinationSurfaceWidth = 1;
5530 blt.DestinationSurfaceHeight = 4;
5531 blt.DestinationSurfaceType = XY_SURFTYPE_2D;
5532 blt.DestinationSurfaceQPitch = 4;
5533 blt.DestinationTiling = XY_TILE_LINEAR;
5534 }
5535 #endif
5536 }
5537
5538 void
5539 genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
5540 const struct intel_urb_config *urb_cfg)
5541 {
5542 #if INTEL_NEEDS_WA_16014912113
5543 const struct intel_urb_config *current =
5544 &cmd_buffer->state.gfx.urb_cfg;
5545 if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
5546 current->size[0] != 0) {
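/* Re-emit the previous URB configuration with a fixed VS entry count
 * (and zero entries for the other stages), followed by an HDC pipeline
 * flush, before the new URB layout gets programmed.
 */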
5547 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
5548 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
5549 urb._3DCommandSubOpcode += i;
5550 urb.VSURBStartingAddress = current->start[i];
5551 urb.VSURBEntryAllocationSize = current->size[i] - 1;
5552 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
5553 }
5554 }
5555 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5556 pc.HDCPipelineFlushEnable = true;
5557 }
5558 }
5559 #endif
5560 }
5561
5562 struct anv_state
5563 genX(cmd_buffer_begin_companion_rcs_syncpoint)(
5564 struct anv_cmd_buffer *cmd_buffer)
5565 {
5566 #if GFX_VERx10 >= 125
5567 const struct intel_device_info *info = cmd_buffer->device->info;
5568 struct anv_state syncpoint =
5569 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
5570 struct anv_address xcs_wait_addr =
5571 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5572 syncpoint);
5573 struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
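/* The syncpoint holds two 32-bit semaphores: dword 0 is what this queue
 * (CCS/blitter) waits on, dword 1 is what the companion RCS waits on.
 */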
5574
5575 /* Reset the sync point */
5576 memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
5577
5578 struct mi_builder b;
5579
5580 /* On CCS:
5581 * - flush all caches & invalidate
5582 * - unblock RCS
5583 * - wait on RCS to complete
5584 * - clear the value we waited on
5585 */
5586
5587 if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
5588 anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_FLUSH_BITS |
5589 ANV_PIPE_INVALIDATE_BITS |
5590 ANV_PIPE_STALL_BITS,
5591 "post main cmd buffer invalidate");
5592 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5593 } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
5594 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
5595 if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
5596 genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
5597 cmd_buffer->device);
5598 }
5599 anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
5600 fd.FlushCCS = true; /* Maybe handle Flush LLC */
5601 }
5602 }
5603
5604 {
5605 mi_builder_init(&b, info, &cmd_buffer->batch);
5606 mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
5607 anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5608 sem.WaitMode = PollingMode;
5609 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
5610 sem.SemaphoreDataDword = 0x1;
5611 sem.SemaphoreAddress = xcs_wait_addr;
5612 }
5613 /* Make sure to reset the semaphore in case the command buffer is run
5614 * multiple times.
5615 */
5616 mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
5617 }
5618
5619 /* On RCS:
5620 * - wait on CCS signal
5621 * - clear the value we waited on
5622 */
5623 {
5624 mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
5625 anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
5626 GENX(MI_SEMAPHORE_WAIT),
5627 sem) {
5628 sem.WaitMode = PollingMode;
5629 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
5630 sem.SemaphoreDataDword = 0x1;
5631 sem.SemaphoreAddress = rcs_wait_addr;
5632 }
5633 /* Make sure to reset the semaphore in case the command buffer is run
5634 * multiple times.
5635 */
5636 mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
5637 }
5638
5639 return syncpoint;
5640 #else
5641 unreachable("Not implemented");
5642 #endif
5643 }
5644
5645 void
5646 genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
5647 struct anv_state syncpoint)
5648 {
5649 #if GFX_VERx10 >= 125
5650 struct anv_address xcs_wait_addr =
5651 anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
5652 syncpoint);
5653
5654 struct mi_builder b;
5655
5656 /* On RCS:
5657 * - flush all caches & invalidate
5658 * - unblock the CCS
5659 */
5660 anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
5661 ANV_PIPE_FLUSH_BITS |
5662 ANV_PIPE_INVALIDATE_BITS |
5663 ANV_PIPE_STALL_BITS,
5664 "post rcs flush");
5665 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
5666
5667 mi_builder_init(&b, cmd_buffer->device->info,
5668 &cmd_buffer->companion_rcs_cmd_buffer->batch);
5669 mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
5670 #else
5671 unreachable("Not implemented");
5672 #endif
5673 }
5674
5675 VkResult
5676 genX(write_trtt_entries)(struct anv_trtt_submission *submit)
5677 {
5678 #if GFX_VER >= 12
5679 size_t batch_size = submit->l3l2_binds_len * 20 +
5680 submit->l1_binds_len * 16 + 8;
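/* The sizes above are worst case, assuming no writes get coalesced: a
 * QWord MI_STORE_DATA_IMM per L3/L2 bind is 5 dwords (20 bytes), a DWord
 * write per L1 bind is 4 dwords (16 bytes), plus room for the final
 * MI_BATCH_BUFFER_END.
 */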
5681 STACK_ARRAY(uint32_t, cmds, batch_size);
5682 struct anv_batch batch = {
5683 .start = cmds,
5684 .next = cmds,
5685 .end = (void *)cmds + batch_size,
5686 };
5687
5688 /* BSpec says:
5689 * "DWord Length programmed must not exceed 0x3FE."
5690 * For a single dword write the programmed length is 2, and for a single
5691 * qword it's 3. This is the value we actually write into the command's
5692 * length field, so it does not include the bias.
5693 */
5694 uint32_t dword_write_len = 2;
5695 uint32_t qword_write_len = 3;
5696 uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
5697 uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
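/* Each extra DWord write appends one dword of payload to the command and
 * each extra QWord write appends two, hence the division by 2 above.
 */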
5698
5699 /* What makes the code below quite complicated is the fact that we can
5700 * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
5701 * contiguous addresses.
5702 */
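/* For example, three L3/L2 binds whose PTE addresses are X, X+8 and X+16
 * collapse into a single MI_STORE_DATA_IMM carrying three QWord payloads.
 */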
5703
5704 for (int i = 0; i < submit->l3l2_binds_len; i++) {
5705 int extra_writes = 0;
5706 for (int j = i + 1;
5707 j < submit->l3l2_binds_len &&
5708 extra_writes <= max_qword_extra_writes;
5709 j++) {
5710 if (submit->l3l2_binds[i].pte_addr + (j - i) * 8 ==
5711 submit->l3l2_binds[j].pte_addr) {
5712 extra_writes++;
5713 } else {
5714 break;
5715 }
5716 }
5717 bool is_last_write = submit->l1_binds_len == 0 &&
5718 i + extra_writes + 1 == submit->l3l2_binds_len;
5719
5720 uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
5721 qword_write_len + (extra_writes * 2);
5722 uint32_t *dw;
5723 dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
5724 .ForceWriteCompletionCheck = is_last_write,
5725 .StoreQword = true,
5726 .Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr),
5727 );
5728 dw += 3;
5729 for (int j = 0; j < extra_writes + 1; j++) {
5730 uint64_t entry_addr_64b = submit->l3l2_binds[i + j].entry_addr;
5731 *dw = entry_addr_64b & 0xFFFFFFFF;
5732 dw++;
5733 *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
5734 dw++;
5735 }
5736 assert(dw == batch.next);
5737
5738 i += extra_writes;
5739 }
5740
5741 for (int i = 0; i < submit->l1_binds_len; i++) {
5742 int extra_writes = 0;
5743 for (int j = i + 1;
5744 j < submit->l1_binds_len && extra_writes <= max_dword_extra_writes;
5745 j++) {
5746 if (submit->l1_binds[i].pte_addr + (j - i) * 4 ==
5747 submit->l1_binds[j].pte_addr) {
5748 extra_writes++;
5749 } else {
5750 break;
5751 }
5752 }
5753
5754 bool is_last_write = i + extra_writes + 1 == submit->l1_binds_len;
5755
5756 uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
5757 dword_write_len + extra_writes;
5758 uint32_t *dw;
5759 dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
5760 .ForceWriteCompletionCheck = is_last_write,
5761 .Address = anv_address_from_u64(submit->l1_binds[i].pte_addr),
5762 );
5763 dw += 3;
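/* Each L1 PTE is a single dword holding address bits [47:16] of the
 * (presumably 64KiB-aligned) entry address, hence the shift by 16 below.
 */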
5764 for (int j = 0; j < extra_writes + 1; j++) {
5765 *dw = (submit->l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
5766 dw++;
5767 }
5768 assert(dw == batch.next);
5769
5770 i += extra_writes;
5771 }
5772
5773 anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
5774
5775 assert(batch.next <= batch.end);
5776
5777 VkResult result = anv_queue_submit_trtt_batch(submit->sparse, &batch);
5778 STACK_ARRAY_FINISH(cmds);
5779
5780 return result;
5781
5782 #endif
5783 return VK_SUCCESS;
5784 }
5785
5786 void
5787 genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
5788 VkPipelineStageFlags2 stage,
5789 VkBuffer dstBuffer,
5790 VkDeviceSize dstOffset,
5791 uint32_t marker)
5792 {
5793 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5794 ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
5795
5796 /* The barriers inserted by the application to make dstBuffer writable
5797 * should already have the L1/L2 cache flushes. On platforms where the
5798 * command streamer is not coherent with L3, we need an additional set of
5799 * cache flushes.
5800 */
5801 enum anv_pipe_bits bits =
5802 (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
5803 (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
5804 ANV_PIPE_END_OF_PIPE_SYNC_BIT;
5805
5806 trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
5807
5808 anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
5809 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5810
5811 struct mi_builder b;
5812 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5813
5814 /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
5815 * would be the logical way to implement this extension, as it could
5816 * do a pipelined marker write. Unfortunately, it requires writing
5817 * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
5818 * 32-bit value. MI_STORE_DATA_IMM is the only good way to do that,
5819 * and unfortunately it requires stalling.
5820 */
5821 mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
5822 mi_imm(marker));
5823
5824 trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
5825 }
5826