1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "anv_private.h"
25
26 #include "common/intel_aux_map.h"
27 #include "common/intel_sample_positions.h"
28 #include "common/intel_pixel_hash.h"
29 #include "genxml/gen_macros.h"
30 #include "genxml/genX_pack.h"
31
32 #include "vk_standard_sample_locations.h"
33
34 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL
35 #include "grl/genX_grl.h"
36 #endif
37
38 #include "genX_mi_builder.h"
39
40 #include "vk_util.h"
41 #include "vk_format.h"
42
43 static void
44 genX(emit_slice_hashing_state)(struct anv_device *device,
45 struct anv_batch *batch)
46 {
47 #if GFX_VER == 11
48 /* Gfx11 hardware has two pixel pipes at most. */
49 for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
50 assert(device->info->ppipe_subslices[i] == 0);
51
52 if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
53 return;
54
55 if (!device->slice_hash.alloc_size) {
56 unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
57 device->slice_hash =
58 anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
59
60 const bool flip = device->info->ppipe_subslices[0] <
61 device->info->ppipe_subslices[1];
62 struct GENX(SLICE_HASH_TABLE) table;
63 intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
64
65 GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
66 }
67
68 anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
69 ptr.SliceHashStatePointerValid = true;
70 ptr.SliceHashTableStatePointer = device->slice_hash.offset;
71 }
72
73 anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
74 mode.SliceHashingTableEnable = true;
75 }
76 #elif GFX_VERx10 == 120
77 /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
78 * present with n active dual subslices.
79 */
80 unsigned ppipes_of[3] = {};
81
82 for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
83 for (unsigned p = 0; p < 3; p++)
84 ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
85 }
86
87 /* Gfx12 has three pixel pipes. */
88 for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
89 assert(device->info->ppipe_subslices[p] == 0);
90
91 if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
92 /* All three pixel pipes have the maximum number of active dual
93 * subslices, or there is only one active pixel pipe: Nothing to do.
94 */
95 return;
96 }
97
98 anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
99 p.SliceHashControl[0] = TABLE_0;
100
101 if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
102 intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
103 else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
104 intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
105
106 if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
107 intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
108 else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
109 intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
110 else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
111 intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
112 else
113 unreachable("Illegal fusing.");
114 }
115
116 anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
117 p.SubsliceHashingTableEnable = true;
118 p.SubsliceHashingTableEnableMask = true;
119 }
120 #elif GFX_VERx10 == 125
121 /* Calculate the set of present pixel pipes, and another set of
122 * present pixel pipes with 2 dual subslices enabled; the latter
123 * will appear in the hashing table with twice the frequency of
124 * pixel pipes with a single dual subslice present.
125 */
126 uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
127 for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
128 if (device->info->ppipe_subslices[p] > 0)
129 ppipe_mask1 |= (1u << p);
130 if (device->info->ppipe_subslices[p] > 1)
131 ppipe_mask2 |= (1u << p);
132 }
133 assert(ppipe_mask1);
134
135 if (!device->slice_hash.alloc_size) {
136 unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
137 device->slice_hash =
138 anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
139
140 struct GENX(SLICE_HASH_TABLE) table;
141
142 /* Note that the hardware expects an array with 7 tables; each
143 * table is intended to specify the pixel pipe hashing behavior
144 * for one of the possible slice counts between 2 and 8. However,
145 * that doesn't actually work, among other reasons due to hardware
146 * bugs that will cause the GPU to erroneously access the table
147 * at the wrong index in some cases, so in practice all 7 tables
148 * need to be initialized to the same value.
149 */
150 for (unsigned i = 0; i < 7; i++)
151 intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
152 table.Entry[i][0]);
153
154 GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
155 }
156
157 anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
158 ptr.SliceHashStatePointerValid = true;
159 ptr.SliceHashTableStatePointer = device->slice_hash.offset;
160 }
161
162 /* TODO: Figure out FCV support for other platforms.
163 * Testing indicates that FCV is broken on gfx125.
164 * Let's disable FCV for now until we figure out what's wrong.
165 *
166 * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
167 *
168 * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
169 * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10318
170 * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10795
171 * Ref: Internal issue 1480 about Unreal Engine 5.1
172 */
173 anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
174 mode.SliceHashingTableEnable = true;
175 mode.SliceHashingTableEnableMask = true;
176 mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
177 hashing32x32 : NormalMode);
178 mode.CrossSliceHashingModeMask = -1;
179 mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
180 mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
181 }
182 #endif
183 }
184
185 static void
186 state_system_mem_fence_address_emit(struct anv_device *device, struct anv_batch *batch)
187 {
188 #if GFX_VERx10 >= 200
189 struct anv_address addr = { .bo = device->mem_fence_bo };
190 anv_batch_emit(batch, GENX(STATE_SYSTEM_MEM_FENCE_ADDRESS), mem_fence_addr) {
191 mem_fence_addr.SystemMemoryFenceAddress = addr;
192 }
193 #endif
194 }
195
196 static void
197 init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
198 {
199 UNUSED struct anv_device *device = queue->device;
200
201 #if GFX_VER >= 11
202 /* Starting with GFX version 11, SLM is no longer part of the L3$ config
203 * so it never changes throughout the lifetime of the VkDevice.
204 */
205 const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
206 genX(emit_l3_config)(batch, device, cfg);
207 device->l3_config = cfg;
208 #endif
209
210 #if GFX_VERx10 == 125
211 /* Even though L3 partial write merging is supposed to be enabled
212 * by default on Gfx12.5 according to the hardware spec, i915
213 * appears to accidentally clear the enables during context
214 * initialization, so make sure to enable them here since partial
215 * write merging has a large impact on rendering performance.
216 */
217 anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
218 reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
219 reg.CompressiblePartialWriteMergeEnable = true;
220 reg.CoherentPartialWriteMergeEnable = true;
221 reg.CrossTilePartialWriteMergeEnable = true;
222 }
223 #endif
224
225 /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
226 * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
227 */
228 #if GFX_VER >= 12
229
230 #if GFX_VERx10 >= 125
231 /* Wa_14016407139:
232 *
233 * "On Surface state base address modification, for 3D workloads, SW must
234 * always program PIPE_CONTROL either with CS Stall or PS sync stall. In
235 * both the cases set Render Target Cache Flush Enable".
236 */
237 genx_batch_emit_pipe_control(batch, device->info,
238 0,
239 ANV_PIPE_CS_STALL_BIT |
240 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
241 #endif
242
243 /* GEN:BUG:1607854226:
244 *
245 * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
246 * Fortunately, we always start the context off in 3D mode.
247 */
248 uint32_t mocs = device->isl_dev.mocs.internal;
249 anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
250 sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
251 sba.GeneralStateBufferSize = 0xfffff;
252 sba.GeneralStateMOCS = mocs;
253 sba.GeneralStateBaseAddressModifyEnable = true;
254 sba.GeneralStateBufferSizeModifyEnable = true;
255
256 sba.StatelessDataPortAccessMOCS = mocs;
257
258 sba.SurfaceStateBaseAddress =
259 (struct anv_address) { .offset =
260 device->physical->va.internal_surface_state_pool.addr,
261 };
262 sba.SurfaceStateMOCS = mocs;
263 sba.SurfaceStateBaseAddressModifyEnable = true;
264
265 sba.DynamicStateBaseAddress =
266 (struct anv_address) { .offset =
267 device->physical->va.dynamic_state_pool.addr,
268 };
269 sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
270 device->physical->va.dynamic_visible_pool.size) / 4096;
271 sba.DynamicStateMOCS = mocs;
272 sba.DynamicStateBaseAddressModifyEnable = true;
273 sba.DynamicStateBufferSizeModifyEnable = true;
274
275 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
276 sba.IndirectObjectBufferSize = 0xfffff;
277 sba.IndirectObjectMOCS = mocs;
278 sba.IndirectObjectBaseAddressModifyEnable = true;
279 sba.IndirectObjectBufferSizeModifyEnable = true;
280
281 sba.InstructionBaseAddress =
282 (struct anv_address) { .offset =
283 device->physical->va.instruction_state_pool.addr,
284 };
285 sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
286 sba.InstructionMOCS = mocs;
287 sba.InstructionBaseAddressModifyEnable = true;
288 sba.InstructionBuffersizeModifyEnable = true;
289
290 #if GFX_VER >= 11
291 sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
292 sba.BindlessSamplerStateBufferSize = 0;
293 sba.BindlessSamplerStateMOCS = mocs;
294 sba.BindlessSamplerStateBaseAddressModifyEnable = true;
295 #endif
296
297 if (device->physical->indirect_descriptors) {
298 sba.BindlessSurfaceStateBaseAddress =
299 (struct anv_address) { .offset =
300 device->physical->va.bindless_surface_state_pool.addr,
301 };
302 sba.BindlessSurfaceStateSize =
303 anv_physical_device_bindless_heap_size(device->physical, false) /
304 ANV_SURFACE_STATE_SIZE - 1;
305 sba.BindlessSurfaceStateMOCS = mocs;
306 sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
307 } else {
308 /* Bindless Surface State & Bindless Sampler State are aligned to the
309 * same heap
310 */
311 sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
312 .offset = device->physical->va.internal_surface_state_pool.addr,
313 };
314 sba.BindlessSurfaceStateSize =
315 (device->physical->va.internal_surface_state_pool.size +
316 device->physical->va.bindless_surface_state_pool.size) - 1;
317 sba.BindlessSurfaceStateMOCS = mocs;
318 sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
319 }
320
321 #if GFX_VERx10 >= 125
322 sba.L1CacheControl = L1CC_WB;
323 #endif
324 }
325
326 /* Disable the POOL_ALLOC mechanism in HW. We found that this state can get
327 * corrupted (likely due to leaking from another context), so the default
328 * value should be the disabled one. It doesn't cost anything to set it once
329 * at device initialization.
330 */
331 #if GFX_VER >= 11 && GFX_VERx10 < 125
332 anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
333 btpa.MOCS = mocs;
334 btpa.BindingTablePoolEnable = false;
335 }
336 #endif
337
338 struct mi_builder b;
339 mi_builder_init(&b, device->info, batch);
340
341 mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
342 mi_imm(device->physical->va.internal_surface_state_pool.addr));
343 #endif /* GFX_VER >= 12 */
344
345 #if GFX_VERx10 >= 125
346 if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
347 anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
348 /* TODO: This is the timeout after which the bucketed thread
349 * dispatcher will kick off a wave of threads. We go with the
350 * lowest value for now. It could be tweaked on a per-application
351 * basis (drirc).
352 */
353 btd.DispatchTimeoutCounter = _64clocks;
354 /* BSpec 43851: "This field must be programmed to 6h i.e. memory
355 * backed buffer must be 128KB."
356 */
357 btd.PerDSSMemoryBackedBufferSize = 6;
358 btd.MemoryBackedBufferBasePointer = (struct anv_address) {
359 /* This batch doesn't have a reloc list so we can't use the BO
360 * here. We just use the address directly.
361 */
362 .offset = device->btd_fifo_bo->offset,
363 };
364 #if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
365 btd.BTDMidthreadpreemption = false;
366 #endif
367 }
368 }
369 #endif
370
371 state_system_mem_fence_address_emit(device, batch);
372 }
373
374 #if GFX_VER >= 20
375 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
376 #else
377 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
378 #endif
379
380 static VkResult
381 init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
382 {
383 struct anv_device *device = queue->device;
384 UNUSED const struct intel_device_info *devinfo = queue->device->info;
385
386 struct anv_async_submit *submit;
387 VkResult result = anv_async_submit_create(queue,
388 &device->batch_bo_pool,
389 is_companion_rcs_batch,
390 true, &submit);
391 if (result != VK_SUCCESS)
392 return result;
393
394 struct anv_batch *batch = &submit->batch;
395
396 genX(emit_pipeline_select)(batch, _3D, device);
397
398 #if GFX_VER == 9
399 anv_batch_write_reg(batch, GENX(CACHE_MODE_1), cm1) {
400 cm1.FloatBlendOptimizationEnable = true;
401 cm1.FloatBlendOptimizationEnableMask = true;
402 cm1.MSCRAWHazardAvoidanceBit = true;
403 cm1.MSCRAWHazardAvoidanceBitMask = true;
404 cm1.PartialResolveDisableInVC = true;
405 cm1.PartialResolveDisableInVCMask = true;
406 }
407 #endif
408
409 anv_batch_emit(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
410
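/* Cover the full 16-bit addressable range so the drawing rectangle never
 * clips rendering; per-draw clipping is handled through viewports and
 * scissors instead.
 */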
411 anv_batch_emit(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
412 rect.ClippedDrawingRectangleYMin = 0;
413 rect.ClippedDrawingRectangleXMin = 0;
414 rect.ClippedDrawingRectangleYMax = UINT16_MAX;
415 rect.ClippedDrawingRectangleXMax = UINT16_MAX;
416 rect.DrawingRectangleOriginY = 0;
417 rect.DrawingRectangleOriginX = 0;
418 }
419
420 anv_batch_emit(batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
421
422 /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
423 *
424 * "3DSTATE_RASTER if used must be programmed prior to using this
425 * packet."
426 *
427 * Emit this before 3DSTATE_WM_HZ_OP below.
428 */
429 anv_batch_emit(batch, GENX(3DSTATE_RASTER), rast) {
430 rast.APIMode = DX101;
431 }
432
433 /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
434 *
435 * "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
436 * change the Number of Multisamples. This packet must not be used to
437 * change Number of Multisamples in a rendering sequence."
438 *
439 * Emit this before 3DSTATE_WM_HZ_OP below.
440 */
441 anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
442
443 /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
444 * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
445 * Clear." It mentions that the packet overrides GPU state for the clear
446 * operation and needs to be reset to 0s to clear the overrides. Depending
447 * on the kernel, we may not get a context with the state for this packet
448 * zeroed. Do it ourselves just in case. We've observed this to prevent a
449 * number of GPU hangs on ICL.
450 */
451 anv_batch_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
452
453 genX(emit_sample_pattern)(batch, NULL);
454
455 #if GFX_VER == 11
456 /* The default value of bit 5 "Headerless Message for Pre-emptable
457 * Contexts" in the SAMPLER_MODE register is 0, which means
458 * headerless sampler messages are not allowed for pre-emptable
459 * contexts. Set bit 5 to 1 to allow them.
460 */
461 anv_batch_write_reg(batch, GENX(SAMPLER_MODE), sm) {
462 sm.HeaderlessMessageforPreemptableContexts = true;
463 sm.HeaderlessMessageforPreemptableContextsMask = true;
464 }
465
466 /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
467 * the HALF_SLICE_CHICKEN7 register.
468 */
469 anv_batch_write_reg(batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
470 hsc7.EnabledTexelOffsetPrecisionFix = true;
471 hsc7.EnabledTexelOffsetPrecisionFixMask = true;
472 }
473
474 anv_batch_write_reg(batch, GENX(TCCNTLREG), tcc) {
475 tcc.L3DataPartialWriteMergingEnable = true;
476 tcc.ColorZPartialWriteMergingEnable = true;
477 tcc.URBPartialWriteMergingEnable = true;
478 tcc.TCDisable = true;
479 }
480 #endif
481 genX(emit_slice_hashing_state)(device, batch);
482
483 #if GFX_VER >= 11
484 /* The hardware specification recommends disabling repacking for
485 * compatibility with the decompression mechanism in the display controller.
486 */
487 if (device->info->disable_ccs_repack) {
488 anv_batch_write_reg(batch, GENX(CACHE_MODE_0), cm0) {
489 cm0.DisableRepackingforCompression = true;
490 cm0.DisableRepackingforCompressionMask = true;
491 }
492 }
493
494 /* An unknown issue is causing VS push constants to become
495 * corrupted during object-level preemption. For now, restrict
496 * preemption to the command buffer level to avoid rendering
497 * corruption.
498 */
499 anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
500 cc1.ReplayMode = MidcmdbufferPreemption;
501 cc1.ReplayModeMask = true;
502
503 #if GFX_VERx10 == 120
504 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
505 cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
506 #endif
507 }
508
509 #if INTEL_NEEDS_WA_1806527549
510 /* Wa_1806527549 says to disable the following HiZ optimization when the
511 * depth buffer is D16_UNORM. However, we've found the WA to help with more
512 * depth buffer configurations, so we always disable it just to be safe.
513 */
514 anv_batch_write_reg(batch, GENX(HIZ_CHICKEN), reg) {
515 reg.HZDepthTestLEGEOptimizationDisable = true;
516 reg.HZDepthTestLEGEOptimizationDisableMask = true;
517 }
518 #endif
519
520 #if GFX_VER == 12
521 anv_batch_write_reg(batch, GENX(FF_MODE2), reg) {
522 /* On Alchemist, the FF_MODE2 docs for the GS timer say:
523 *
524 * "The timer value must be set to 224."
525 *
526 * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
527 * and that this is necessary to avoid hanging the HS/DS units. It
528 * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
529 *
530 * The HS timer docs also have the same quote for Alchemist. I am
531 * unaware of a reason it needs to be set to 224 on Tigerlake, but
532 * we do so for consistency if nothing else.
533 *
534 * For the TDS timer value, the docs say:
535 *
536 * "For best performance, a value of 4 should be programmed."
537 *
538 * i915 also sets it this way on Tigerlake due to workarounds.
539 *
540 * The default VS timer appears to be 0, so we leave it at that.
541 */
542 reg.GSTimerValue = 224;
543 reg.HSTimerValue = 224;
544 reg.TDSTimerValue = 4;
545 reg.VSTimerValue = 0;
546 }
547 #endif
548
549 #if INTEL_NEEDS_WA_1508744258
550 /* Disable RHWO by setting 0x7010[14] by default except during resolve
551 * pass.
552 *
553 * We implement global disabling of the optimization here and we toggle it
554 * in anv_image_ccs_op().
555 */
556 anv_batch_write_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
557 c1.RCCRHWOOptimizationDisable = true;
558 c1.RCCRHWOOptimizationDisableMask = true;
559 }
560 #endif
561
562 #if GFX_VERx10 < 125
563 #define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
564 #else
565 #define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
566 #endif
567
568 /* Enable the new line drawing algorithm that produces higher quality
569 * lines.
570 */
571 anv_batch_write_reg(batch, AA_LINE_QUALITY_REG, c3) {
572 c3.AALineQualityFix = true;
573 c3.AALineQualityFixMask = true;
574 }
575 #endif
576
577 #if GFX_VER == 12
578 if (device->info->has_aux_map) {
579 uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
580 assert(aux_base_addr % (32 * 1024) == 0);
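/* GFX_AUX_TABLE_BASE_ADDR is a 64-bit register, but MI_LOAD_REGISTER_IMM
 * writes 32 bits at a time, so program the low and high dwords with two
 * separate LRIs.
 */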
581 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
582 lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
583 lri.DataDWord = aux_base_addr & 0xffffffff;
584 }
585 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
586 lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
587 lri.DataDWord = aux_base_addr >> 32;
588 }
589 }
590 #endif
591
592 #if GFX_VERx10 == 125
593 anv_batch_write_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
594 reg.TBIMRBatchSizeOverride = true;
595 reg.TBIMROpenBatchEnable = true;
596 reg.TBIMRFastClip = true;
597 reg.TBIMRBatchSizeOverrideMask = true;
598 reg.TBIMROpenBatchEnableMask = true;
599 reg.TBIMRFastClipMask = true;
600 }
601 #endif
602
603 /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
604 * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
605 *
606 * This is only safe on kernels with context isolation support.
607 */
608 assert(device->physical->info.has_context_isolation);
609 anv_batch_write_reg(batch, GENX(CS_DEBUG_MODE2), csdm2) {
610 csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
611 csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
612 }
613
614 init_common_queue_state(queue, batch);
615
616 /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
617 * the dynamic state base address we need to emit this instruction after
618 * STATE_BASE_ADDRESS in init_common_queue_state().
619 */
620 #if GFX_VER >= 30
621 anv_batch_emit(batch, GENX(3DSTATE_COARSE_PIXEL), cps);
622 #elif GFX_VER >= 12
623 anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
624 assert(device->cps_states.alloc_size != 0);
625 /* Offset 0 is the disabled state */
626 cps.CoarsePixelShadingStateArrayPointer =
627 device->cps_states.offset;
628 }
629 #elif GFX_VER == 11
630 anv_batch_emit(batch, GENX(3DSTATE_CPS), cps);
631 #endif
632
633 #if GFX_VERx10 >= 125
634 anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
635 cm.Mask1 = 0xffff;
636 #if GFX_VERx10 >= 200
637 cm.Mask2 = 0xffff;
638 #endif
639 }
640 anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
641 anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
642
643 /* We are no longer required to explicitly flush or invalidate caches since the
644 * PIPELINE_SELECT is getting deprecated on Xe2+.
645 */
646 #if GFX_VER < 20
647 genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
648 ANV_NULL_ADDRESS,
649 0,
650 ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
651 #endif
652
653 genX(emit_pipeline_select)(batch, GPGPU, device);
654 anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
655 cfe.MaximumNumberofThreads =
656 devinfo->max_cs_threads * devinfo->subslice_total;
657 }
658
659 /* We are no longer required to explicitly flush or invalidate caches since the
660 * PIPELINE_SELECT is getting deprecated on Xe2+.
661 */
662 #if GFX_VER < 20
663 genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
664 ANV_NULL_ADDRESS,
665 0,
666 ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
667 #endif
668
669 genX(emit_pipeline_select)(batch, _3D, device);
670 #endif
671
672 #if GFX_VER >= 20
673 anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
674 p.DX10OGLBorderModeforYCRCB = true;
675 p.DX10OGLBorderModeforYCRCBMask = true;
676 #if INTEL_NEEDS_WA_14019857787
677 p.EnableOOOreadsinRCPB = true;
678 p.EnableOOOreadsinRCPBMask = true;
679 #endif
680 }
681 #endif
682
683 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
684
685 result = batch->status;
686 if (result != VK_SUCCESS) {
687 anv_async_submit_destroy(submit);
688 return result;
689 }
690
691 result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
692 if (result != VK_SUCCESS) {
693 anv_async_submit_destroy(submit);
694 return result;
695 }
696
697 if (is_companion_rcs_batch)
698 queue->init_companion_submit = submit;
699 else
700 queue->init_submit = submit;
701
702 return VK_SUCCESS;
703 }
704
705 static VkResult
706 init_compute_queue_state(struct anv_queue *queue)
707 {
708 struct anv_device *device = queue->device;
709 UNUSED const struct intel_device_info *devinfo = device->info;
710 struct anv_async_submit *submit;
711 VkResult result = anv_async_submit_create(queue,
712 &device->batch_bo_pool,
713 false, true, &submit);
714 if (result != VK_SUCCESS)
715 return result;
716
717 struct anv_batch *batch = &submit->batch;
718
719 genX(emit_pipeline_select)(batch, GPGPU, queue->device);
720
721 #if GFX_VER == 12
722 if (queue->device->info->has_aux_map) {
723 uint64_t aux_base_addr =
724 intel_aux_map_get_base(queue->device->aux_map_ctx);
725 assert(aux_base_addr % (32 * 1024) == 0);
726 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
727 lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
728 lri.DataDWord = aux_base_addr & 0xffffffff;
729 }
730 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
731 lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
732 lri.DataDWord = aux_base_addr >> 32;
733 }
734 }
735 #else
736 assert(!queue->device->info->has_aux_map);
737 #endif
738
739 /* Wa_14015782607 - Issue pipe control with HDC_flush and
740 * untyped cache flush set to 1 when CCS has NP state update with
741 * STATE_COMPUTE_MODE.
742 */
743 if (intel_needs_workaround(devinfo, 14015782607) &&
744 queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
745 genx_batch_emit_pipe_control(batch, devinfo, GPGPU,
746 ANV_PIPE_CS_STALL_BIT |
747 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
748 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
749 }
750
751 #if GFX_VERx10 >= 125
752 /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
753 * emitting NP state commands with ATS-M in compute mode.
754 */
755 if (intel_device_info_is_atsm(devinfo) &&
756 queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
757 genx_batch_emit_pipe_control
758 (batch, devinfo, GPGPU,
759 ANV_PIPE_CS_STALL_BIT |
760 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
761 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
762 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
763 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
764 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
765 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
766 }
767
768 anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
769 #if GFX_VER >= 20
770 cm.AsyncComputeThreadLimit = ACTL_Max8;
771 cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
772 cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit;
773 cm.AsyncComputeThreadLimitMask = 0x7;
774 cm.ZPassAsyncComputeThreadLimitMask = 0x7;
775 cm.ZAsyncThrottlesettingsMask = 0x3;
776 #else
777 cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
778 cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
779 cm.PixelAsyncComputeThreadLimitMask = 0x7;
780 cm.ZPassAsyncComputeThreadLimitMask = 0x7;
781 if (intel_device_info_is_mtl_or_arl(devinfo)) {
782 cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit;
783 cm.ZAsyncThrottlesettingsMask = 0x3;
784 }
785 #endif
786 }
787 #endif
788
789 init_common_queue_state(queue, batch);
790
791 #if GFX_VERx10 >= 125
792 anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
793 cfe.MaximumNumberofThreads =
794 devinfo->max_cs_threads * devinfo->subslice_total;
795 }
796 #endif
797
798 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
799
800 result = batch->status;
801 if (result != VK_SUCCESS) {
802 anv_async_submit_destroy(submit);
803 return result;
804 }
805
806 result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
807 if (result != VK_SUCCESS) {
808 anv_async_submit_destroy(submit);
809 return result;
810 }
811
812 queue->init_submit = submit;
813
814 return VK_SUCCESS;
815 }
816
817 static VkResult
818 init_copy_video_queue_state(struct anv_queue *queue)
819 {
820 struct anv_device *device = queue->device;
821 UNUSED const struct intel_device_info *devinfo = device->info;
822
823 struct anv_async_submit *submit;
824 VkResult result = anv_async_submit_create(queue,
825 &device->batch_bo_pool,
826 false, true, &submit);
827 if (result != VK_SUCCESS)
828 return result;
829
830 struct anv_batch *batch = &submit->batch;
831
832 #if GFX_VER >= 12
833 if (devinfo->has_aux_map) {
834 uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);
835
836 if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
837 #if GFX_VERx10 >= 125
838 reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
839 #endif
840 }
841
842 uint64_t aux_base_addr =
843 intel_aux_map_get_base(queue->device->aux_map_ctx);
844 assert(aux_base_addr % (32 * 1024) == 0);
845 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
846 lri.RegisterOffset = reg;
847 lri.DataDWord = aux_base_addr & 0xffffffff;
848 }
849 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
850 lri.RegisterOffset = reg + 4;
851 lri.DataDWord = aux_base_addr >> 32;
852 }
853 }
854 #else
855 assert(!queue->device->info->has_aux_map);
856 #endif
857
858 state_system_mem_fence_address_emit(device, batch);
859
860 if (batch->start != batch->next) {
861 anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
862
863 result = batch->status;
864 if (result != VK_SUCCESS) {
865 anv_async_submit_destroy(submit);
866 return result;
867 }
868
869 result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
870 if (result != VK_SUCCESS) {
871 anv_async_submit_destroy(submit);
872 return result;
873 }
874
875 queue->init_submit = submit;
876 } else {
877 anv_async_submit_destroy(submit);
878 }
879
880 return VK_SUCCESS;
881 }
882
883 void
884 genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
885 {
886 assert(pdevice->info.verx10 == GFX_VERx10);
887
888 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
889 #if ANV_SUPPORT_RT_GRL
890 genX(grl_load_rt_uuid)(pdevice->rt_uuid);
891 pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
892 #else
893 STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE);
894 memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE);
895 #endif
896 #endif
897
898 pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
899 pdevice->cmd_capture_data = genX(cmd_capture_data);
900
901 pdevice->gpgpu_pipeline_value = GPGPU;
902
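/* Pre-pack a VERTEX_ELEMENT_STATE that sources constant zero for all four
 * components, so empty_vs_input can be copied wherever a valid but inert
 * vertex element is needed (e.g. pipelines with no vertex attributes).
 */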
903 struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
904 .Valid = true,
905 .Component0Control = VFCOMP_STORE_0,
906 .Component1Control = VFCOMP_STORE_0,
907 .Component2Control = VFCOMP_STORE_0,
908 .Component3Control = VFCOMP_STORE_0,
909 };
910 GENX(VERTEX_ELEMENT_STATE_pack)(NULL, pdevice->empty_vs_input, &empty_ve);
911 }
912
913 VkResult
914 genX(init_device_state)(struct anv_device *device)
915 {
916 VkResult res;
917
918 device->slice_hash = (struct anv_state) { 0 };
919 for (uint32_t i = 0; i < device->queue_count; i++) {
920 struct anv_queue *queue = &device->queues[i];
921 switch (queue->family->engine_class) {
922 case INTEL_ENGINE_CLASS_RENDER:
923 res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
924 break;
925 case INTEL_ENGINE_CLASS_COMPUTE: {
926 res = init_compute_queue_state(queue);
927 if (res != VK_SUCCESS)
928 return res;
929
930 /**
931 * Execute RCS init batch by default on the companion RCS command buffer in
932 * order to support MSAA copy/clear operations on the compute queue.
933 */
934 res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
935 break;
936 }
937 case INTEL_ENGINE_CLASS_VIDEO:
938 res = init_copy_video_queue_state(queue);
939 break;
940 case INTEL_ENGINE_CLASS_COPY:
941 res = init_copy_video_queue_state(queue);
942 if (res != VK_SUCCESS)
943 return res;
944
945 /**
946 * Execute RCS init batch by default on the companion RCS command buffer in
947 * order to support MSAA copy/clear operations on the copy queue.
948 */
949 res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
950 break;
951 default:
952 res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
953 break;
954 }
955 if (res != VK_SUCCESS)
956 return res;
957
958 if (!device->trtt.queue &&
959 queue->family->queueFlags & VK_QUEUE_SPARSE_BINDING_BIT)
960 device->trtt.queue = queue;
961 }
962
963 return res;
964 }
965
966 #if GFX_VERx10 >= 125
967 #define maybe_for_each_shading_rate_op(name) \
968 for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
969 name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
970 name++)
971 #elif GFX_VER >= 12
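/* On Gfx12.0 there are no shading rate combiner ops; the macro expands to
 * nothing, so each "loop" in genX(init_cps_device_state) becomes a plain
 * block executed exactly once and the op operands are never referenced.
 */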
972 #define maybe_for_each_shading_rate_op(name)
973 #endif
974
975 /* Rather than re-emitting the CPS_STATE structure every time it changes and
976 * for as many viewports as needed, we can prepare all possible cases up front
977 * and just pick the right offset from the prepacked states when needed.
978 */
979 void
980 genX(init_cps_device_state)(struct anv_device *device)
981 {
982 #if GFX_VER >= 12 && GFX_VER < 30
983 void *cps_state_ptr = device->cps_states.map;
984
985 /* Disabled CPS mode */
986 for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
987 /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
988 *
989 * "It is an INVALID configuration to set the CPS mode other than
990 * CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
991 * Such configuration should be disallowed at the API level, and
992 * rendering results are undefined."
993 *
994 * Since we select this state when coarse pixel shading is disabled and
995 * that includes when per-sample dispatch is enabled, we need to ensure this
996 * is set to NONE.
997 */
998 struct GENX(CPS_STATE) cps_state = {
999 .CoarsePixelShadingMode = CPS_MODE_NONE,
1000 };
1001
1002 GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
1003 cps_state_ptr += GENX(CPS_STATE_length) * 4;
1004 }
1005
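/* Each run of MAX_VIEWPORTS packed CPS_STATE structures above and below forms
 * one selectable configuration; offset 0 therefore always holds the disabled
 * state.
 */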
1006 maybe_for_each_shading_rate_op(op0) {
1007 maybe_for_each_shading_rate_op(op1) {
1008 for (uint32_t x = 1; x <= 4; x *= 2) {
1009 for (uint32_t y = 1; y <= 4; y *= 2) {
1010 struct GENX(CPS_STATE) cps_state = {
1011 .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
1012 .MinCPSizeX = x,
1013 .MinCPSizeY = y,
1014 };
1015
1016 #if GFX_VERx10 >= 125
1017 static const uint32_t combiner_ops[] = {
1018 [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = PASSTHROUGH,
1019 [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
1020 [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = HIGH_QUALITY,
1021 [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = LOW_QUALITY,
1022 [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = RELATIVE,
1023 };
1024
1025 cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
1026 cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
1027 #endif /* GFX_VERx10 >= 125 */
1028
1029 for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
1030 GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
1031 cps_state_ptr += GENX(CPS_STATE_length) * 4;
1032 }
1033 }
1034 }
1035 }
1036 }
1037 #endif /* GFX_VER >= 12 && GFX_VER < 30 */
1038 }
1039
1040 void
1041 genX(emit_l3_config)(struct anv_batch *batch,
1042 const struct anv_device *device,
1043 const struct intel_l3_config *cfg)
1044 {
1045 #if GFX_VER < 20
1046 UNUSED const struct intel_device_info *devinfo = device->info;
1047
1048 #if GFX_VER >= 12
1049 #define L3_ALLOCATION_REG GENX(L3ALLOC)
1050 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
1051 #else
1052 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
1053 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
1054 #endif
1055
1056 anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
1057 if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
1058 assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
1059 cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
1060 cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
1061 cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
1062 #if GFX_VER >= 12
1063 l3cr.L3FullWayAllocationEnable = true;
1064 #else
1065 unreachable("Invalid L3$ config");
1066 #endif
1067 } else {
1068 #if GFX_VER < 11
1069 l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
1070 #endif
1071 #if INTEL_NEEDS_WA_1406697149
1072 /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
1073 * set in the L3CNTLREG register. The default setting of the bit is not
1074 * the desired behavior.
1075 */
1076 l3cr.ErrorDetectionBehaviorControl = true;
1077 l3cr.UseFullWays = true;
1078 #endif /* INTEL_NEEDS_WA_1406697149 */
1079 assert(cfg->n[INTEL_L3P_IS] == 0);
1080 assert(cfg->n[INTEL_L3P_C] == 0);
1081 assert(cfg->n[INTEL_L3P_T] == 0);
1082 l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
1083 l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
1084 l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
1085 l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
1086 }
1087 }
1088 #endif /* GFX_VER < 20 */
1089 }
1090
1091 void
1092 genX(emit_sample_pattern)(struct anv_batch *batch,
1093 const struct vk_sample_locations_state *sl)
1094 {
1095 assert(sl == NULL || sl->grid_size.width == 1);
1096 assert(sl == NULL || sl->grid_size.height == 1);
1097
1098 /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
1099 * VkPhysicalDeviceFeatures::standardSampleLocations.
1100 */
1101 anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
1102 /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
1103 *
1104 * "When programming the sample offsets (for NUMSAMPLES_4 or _8
1105 * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
1106 * (or 7 for 8X, or 15 for 16X) must have monotonically increasing
1107 * distance from the pixel center. This is required to get the
1108 * correct centroid computation in the device."
1109 *
1110 * However, the Vulkan spec seems to require that the samples occur
1111 * in the order provided through the API. The standard sample patterns
1112 * have the above property that they have monotonically increasing
1113 * distances from the center but client-provided ones do not. As long as
1114 * this only affects centroid calculations as the docs say, we should be
1115 * ok because OpenGL and Vulkan only require that the centroid be some
1116 * lit sample and that it's the same for all samples in a pixel; they
1117 * have no requirement that it be the one closest to center.
1118 */
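/* VK_SAMPLE_COUNT_*_BIT values are powers of two, so doubling i walks the
 * supported 1x through 16x cases below.
 */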
1119 for (uint32_t i = 1; i <= 16; i *= 2) {
1120 switch (i) {
1121 case VK_SAMPLE_COUNT_1_BIT:
1122 if (sl && sl->per_pixel == i) {
1123 INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
1124 } else {
1125 INTEL_SAMPLE_POS_1X(sp._1xSample);
1126 }
1127 break;
1128 case VK_SAMPLE_COUNT_2_BIT:
1129 if (sl && sl->per_pixel == i) {
1130 INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
1131 } else {
1132 INTEL_SAMPLE_POS_2X(sp._2xSample);
1133 }
1134 break;
1135 case VK_SAMPLE_COUNT_4_BIT:
1136 if (sl && sl->per_pixel == i) {
1137 INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
1138 } else {
1139 INTEL_SAMPLE_POS_4X(sp._4xSample);
1140 }
1141 break;
1142 case VK_SAMPLE_COUNT_8_BIT:
1143 if (sl && sl->per_pixel == i) {
1144 INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
1145 } else {
1146 INTEL_SAMPLE_POS_8X(sp._8xSample);
1147 }
1148 break;
1149 case VK_SAMPLE_COUNT_16_BIT:
1150 if (sl && sl->per_pixel == i) {
1151 INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
1152 } else {
1153 INTEL_SAMPLE_POS_16X(sp._16xSample);
1154 }
1155 break;
1156 default:
1157 unreachable("Invalid sample count");
1158 }
1159 }
1160 }
1161 }
1162
1163 static uint32_t
1164 vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
1165 {
1166 switch (filter) {
1167 default:
1168 unreachable("Invalid filter");
1169 case VK_FILTER_NEAREST:
1170 return anisotropyEnable ?
1171 #if GFX_VER >= 30
1172 MAPFILTER_ANISOTROPIC_FAST :
1173 #else
1174 MAPFILTER_ANISOTROPIC :
1175 #endif
1176 MAPFILTER_NEAREST;
1177 case VK_FILTER_LINEAR:
1178 return anisotropyEnable ?
1179 #if GFX_VER >= 30
1180 MAPFILTER_ANISOTROPIC_FAST :
1181 #else
1182 MAPFILTER_ANISOTROPIC :
1183 #endif
1184 MAPFILTER_LINEAR;
1185 }
1186 }
1187
1188 static uint32_t
1189 vk_to_intel_max_anisotropy(float ratio)
1190 {
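/* SAMPLER_STATE encodes the maximum anisotropy ratio in steps of two starting
 * at 2:1, so ratios 2..16 map onto the values 0..7 computed here.
 */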
1191 return (CLAMP(ratio, 2, 16) - 2) / 2;
1192 }
1193
1194 static const uint32_t vk_to_intel_mipmap_mode[] = {
1195 [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
1196 [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
1197 };
1198
1199 static const uint32_t vk_to_intel_tex_address[] = {
1200 [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
1201 [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
1202 [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
1203 [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
1204 [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
1205 };
1206
1207 /* Vulkan specifies the result of shadow comparisons as:
1208 * 1 if ref <op> texel,
1209 * 0 otherwise.
1210 *
1211 * The hardware does:
1212 * 0 if texel <op> ref,
1213 * 1 otherwise.
1214 *
1215 * So, these look a bit strange because there's both a negation
1216 * and swapping of the arguments involved.
1217 */
1218 static const uint32_t vk_to_intel_shadow_compare_op[] = {
1219 [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
1220 [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
1221 [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
1222 [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
1223 [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
1224 [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
1225 [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
1226 [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
1227 };
1228
1229 static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
1230 [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
1231 [VK_SAMPLER_REDUCTION_MODE_MIN] = MINIMUM,
1232 [VK_SAMPLER_REDUCTION_MODE_MAX] = MAXIMUM,
1233 };
1234
1235 VkResult genX(CreateSampler)(
1236 VkDevice _device,
1237 const VkSamplerCreateInfo* pCreateInfo,
1238 const VkAllocationCallbacks* pAllocator,
1239 VkSampler* pSampler)
1240 {
1241 ANV_FROM_HANDLE(anv_device, device, _device);
1242 struct anv_sampler *sampler;
1243
1244 sampler = vk_sampler_create(&device->vk, pCreateInfo,
1245 pAllocator, sizeof(*sampler));
1246 if (!sampler)
1247 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1248
1249 const struct vk_format_ycbcr_info *ycbcr_info =
1250 sampler->vk.format != VK_FORMAT_UNDEFINED ?
1251 vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
1252 assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));
1253
1254 sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;
1255
1256 uint32_t border_color_stride = 64;
1257 uint32_t border_color_offset;
1258 void *border_color_ptr;
1259 if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
1260 border_color_offset = device->border_colors.offset +
1261 pCreateInfo->borderColor *
1262 border_color_stride;
1263 border_color_ptr = device->border_colors.map +
1264 pCreateInfo->borderColor * border_color_stride;
1265 } else {
1266 assert(vk_border_color_is_custom(sampler->vk.border_color));
1267 if (pCreateInfo->flags & VK_SAMPLER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
1268 const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
1269 vk_find_struct_const(pCreateInfo->pNext,
1270 OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
1271 if (opaque_info) {
1272 uint32_t alloc_idx = *((const uint32_t *)opaque_info->opaqueCaptureDescriptorData);
1273 sampler->custom_border_color =
1274 anv_state_reserved_array_pool_alloc_index(&device->custom_border_colors, alloc_idx);
1275 } else {
1276 sampler->custom_border_color =
1277 anv_state_reserved_array_pool_alloc(&device->custom_border_colors, true);
1278 }
1279 } else {
1280 sampler->custom_border_color =
1281 anv_state_reserved_array_pool_alloc(&device->custom_border_colors, false);
1282 }
1283 if (sampler->custom_border_color.alloc_size == 0)
1284 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1285
1286 border_color_offset = sampler->custom_border_color.offset;
1287 border_color_ptr = sampler->custom_border_color.map;
1288
1289 union isl_color_value color = { .u32 = {
1290 sampler->vk.border_color_value.uint32[0],
1291 sampler->vk.border_color_value.uint32[1],
1292 sampler->vk.border_color_value.uint32[2],
1293 sampler->vk.border_color_value.uint32[3],
1294 } };
1295
1296 const struct anv_format *format_desc =
1297 sampler->vk.format != VK_FORMAT_UNDEFINED ?
1298 anv_get_format(sampler->vk.format) : NULL;
1299
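/* If the format's only plane carries a non-identity swizzle, pre-swizzle the
 * custom border color so sampling returns the expected components.
 */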
1300 if (format_desc && format_desc->n_planes == 1 &&
1301 !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
1302 const struct anv_format_plane *fmt_plane = &format_desc->planes[0];
1303
1304 assert(!isl_format_has_int_channel(fmt_plane->isl_format));
1305 color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
1306 }
1307
1308 memcpy(border_color_ptr, color.u32, sizeof(color));
1309 }
1310
1311 const bool seamless_cube =
1312 !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);
1313
1314 struct mesa_sha1 ctx;
1315 _mesa_sha1_init(&ctx);
1316
1317 for (unsigned p = 0; p < sampler->n_planes; p++) {
1318 const bool plane_has_chroma =
1319 ycbcr_info && ycbcr_info->planes[p].has_chroma;
1320 const VkFilter min_filter =
1321 plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
1322 pCreateInfo->minFilter;
1323 const VkFilter mag_filter =
1324 plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
1325 pCreateInfo->magFilter;
1326 const bool force_addr_rounding =
1327 device->physical->instance->force_filter_addr_rounding;
1328 const bool enable_min_filter_addr_rounding =
1329 force_addr_rounding || min_filter != VK_FILTER_NEAREST;
1330 const bool enable_mag_filter_addr_rounding =
1331 force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
1332 /* From Broadwell PRM, SAMPLER_STATE:
1333 * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
1334 */
1335 enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
1336 anv_get_format(sampler->vk.format)->planes[0].isl_format :
1337 ISL_FORMAT_UNSUPPORTED;
1338 const bool isl_format_is_planar_yuv =
1339 plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
1340 isl_format_is_yuv(plane0_isl_format) &&
1341 isl_format_is_planar(plane0_isl_format);
1342
1343 const uint32_t mip_filter_mode =
1344 isl_format_is_planar_yuv ?
1345 MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
1346
1347 struct GENX(SAMPLER_STATE) sampler_state = {
1348 .SamplerDisable = false,
1349 .TextureBorderColorMode = DX10OGL,
1350
1351 #if GFX_VER >= 11
1352 .CPSLODCompensationEnable = true,
1353 #endif
1354
1355 .LODPreClampMode = CLAMP_MODE_OGL,
1356
1357 .MipModeFilter = mip_filter_mode,
1358 .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
1359 .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
1360 .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
1361 .AnisotropicAlgorithm =
1362 pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
1363 .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
1364 .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
1365 .ChromaKeyEnable = 0,
1366 .ChromaKeyIndex = 0,
1367 .ChromaKeyMode = 0,
1368 .ShadowFunction =
1369 vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
1370 pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
1371 .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,
1372
1373 .LODClampMagnificationMode = MIPNONE,
1374
1375 .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
1376 .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1377 .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1378 .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1379 .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1380 .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1381 .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1382 .TrilinearFilterQuality = 0,
1383 .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
1384 .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
1385 .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
1386 .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
1387
1388 .ReductionType =
1389 vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
1390 .ReductionTypeEnable =
1391 sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
1392 };
1393
1394 /* Pack a version of the SAMPLER_STATE without the border color. We'll
1395 * use it to store into the shader cache and also for hashing.
1396 */
1397 GENX(SAMPLER_STATE_pack)(NULL, sampler->state_no_bc[p], &sampler_state);
1398 _mesa_sha1_update(&ctx, sampler->state_no_bc[p], sizeof(sampler->state_no_bc[p]));
1399
1400 /* Set the border color pointer after the hashing; we don't want the
1401 * allocation order of border colors to influence the hash. We just need
1402 * the parameters to be hashed.
1403 */
1404 sampler_state.BorderColorPointer = border_color_offset;
1405 GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
1406 }
1407
1408 /* If we have bindless, allocate enough samplers. We allocate 32 bytes
1409 * for each sampler instead of 16 bytes because we want all bindless
1410 * samplers to be 32-byte aligned so we don't have to use indirect
1411 * sampler messages on them.
1412 */
1413 sampler->bindless_state =
1414 anv_state_pool_alloc(&device->dynamic_state_pool,
1415 sampler->n_planes * 32, 32);
1416 if (sampler->bindless_state.map) {
1417 memcpy(sampler->bindless_state.map, sampler->state,
1418 sampler->n_planes * GENX(SAMPLER_STATE_length) * 4);
1419 }
1420
1421 /* Hash the border color */
1422 _mesa_sha1_update(&ctx, border_color_ptr,
1423 sizeof(union isl_color_value));
1424
1425 _mesa_sha1_final(&ctx, sampler->sha1);
1426
1427 *pSampler = anv_sampler_to_handle(sampler);
1428
1429 return VK_SUCCESS;
1430 }
1431
1432 void
1433 genX(emit_embedded_sampler)(struct anv_device *device,
1434 struct anv_embedded_sampler *sampler,
1435 struct anv_pipeline_embedded_sampler_binding *binding)
1436 {
1437 sampler->ref_cnt = 1;
1438 memcpy(&sampler->key, &binding->key, sizeof(binding->key));
1439
1440 sampler->border_color_state =
1441 anv_state_pool_alloc(&device->dynamic_state_pool,
1442 sizeof(struct gfx8_border_color), 64);
1443 memcpy(sampler->border_color_state.map,
1444 binding->key.color,
1445 sizeof(binding->key.color));
1446
1447 sampler->sampler_state =
1448 anv_state_pool_alloc(&device->dynamic_state_pool,
1449 ANV_SAMPLER_STATE_SIZE, 32);
1450
1451 struct GENX(SAMPLER_STATE) sampler_state = {
1452 .BorderColorPointer = sampler->border_color_state.offset,
1453 };
1454 uint32_t dwords[GENX(SAMPLER_STATE_length)];
1455 GENX(SAMPLER_STATE_pack)(NULL, dwords, &sampler_state);
1456
1457 for (uint32_t i = 0; i < GENX(SAMPLER_STATE_length); i++) {
1458 ((uint32_t *)sampler->sampler_state.map)[i] =
1459 dwords[i] | binding->key.sampler[i];
1460 }
1461 }
1462
1463 /* Wa_14015814527
1464 *
1465 * Check if a task shader was used within cmd_buffer; if so,
1466 * commit empty URB states and a null primitive.
1467 */
1468 void
1469 genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
1470 {
1471 if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
1472 return;
1473
1474 #if GFX_VERx10 >= 125
1475 const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;
1476
1477 if (!intel_needs_workaround(devinfo, 16014390852))
1478 return;
1479
1480 if (cmd_buffer->state.current_pipeline != _3D ||
1481 !cmd_buffer->state.gfx.used_task_shader)
1482 return;
1483
1484 cmd_buffer->state.gfx.used_task_shader = false;
1485
1486 /* Wa_14015821291 mentions that the WA below is not required if we have
1487 * a pipeline flush going on. It will get flushed during
1488 * cmd_buffer_flush_state before draw.
1489 */
1490 if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
1491 return;
1492
1493 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
1494 #if GFX_VER >= 12
1495 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
1496 urb._3DCommandSubOpcode += i;
1497 }
1498 #else
1499 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
1500 urb._3DCommandSubOpcode += i;
1501 }
1502 #endif
1503 }
1504
1505 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
1506 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
1507
1508 /* Issue 'nullprim' to commit the state. */
1509 genx_batch_emit_pipe_control_write
1510 (&cmd_buffer->batch, cmd_buffer->device->info,
1511 cmd_buffer->state.current_pipeline,
1512 WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
1513 #endif
1514 }
1515
1516 VkResult
1517 genX(init_trtt_context_state)(struct anv_async_submit *submit)
1518 {
1519 #if GFX_VER >= 12
1520 struct anv_queue *queue = submit->queue;
1521 struct anv_device *device = queue->device;
1522 struct anv_trtt *trtt = &device->trtt;
1523 struct anv_batch *batch = &submit->batch;
1524
1525 assert((trtt->l3_addr & 0xFFF) == 0);
1526 uint32_t l3_addr_low = (trtt->l3_addr & 0xFFFFF000) >> 12;
1527 uint32_t l3_addr_high = (trtt->l3_addr >> 32) & 0xFFFF;
1528
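/* The same TR-TT configuration is programmed into the render (GFX_*),
 * blitter (BLT_*) and compute (COMP_CTX0_*) copies of these registers.
 */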
1529 anv_batch_write_reg(batch, GENX(GFX_TRTT_INVAL), trtt_inval)
1530 trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1531 anv_batch_write_reg(batch, GENX(GFX_TRTT_NULL), trtt_null)
1532 trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1533 anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low)
1534 trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1535 anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH), trtt_base_high)
1536 trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1537
1538 anv_batch_write_reg(batch, GENX(BLT_TRTT_INVAL), trtt_inval)
1539 trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1540 anv_batch_write_reg(batch, GENX(BLT_TRTT_NULL), trtt_null)
1541 trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1542 anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_LOW), trtt_base_low)
1543 trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1544 anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_HIGH), trtt_base_high)
1545 trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1546
1547 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_INVAL), trtt_inval)
1548 trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1549 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_NULL), trtt_null)
1550 trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1551 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_LOW), trtt_base_low)
1552 trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1553 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_HIGH), trtt_base_high)
1554 trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1555
1556 #if GFX_VER >= 20
1557 uint32_t trva_base = device->physical->va.trtt.addr >> 44;
1558 anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range)
1559 trtt_va_range.TRVABase = trva_base;
1560 anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range)
1561 trtt_va_range.TRVABase = trva_base;
1562 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range)
1563 trtt_va_range.TRVABase = trva_base;
1564 #else
1565 anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
1566 trtt_va_range.TRVAMaskValue = 0xF;
1567 trtt_va_range.TRVADataValue = 0xF;
1568 }
1569 anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range) {
1570 trtt_va_range.TRVAMaskValue = 0xF;
1571 trtt_va_range.TRVADataValue = 0xF;
1572 }
1573 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range) {
1574 trtt_va_range.TRVAMaskValue = 0xF;
1575 trtt_va_range.TRVADataValue = 0xF;
1576 }
1577 #endif
1578
1579 /* Enabling TR-TT needs to be done after setting up the other registers.
1580 */
1581 anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr)
1582 trtt_cr.TRTTEnable = true;
1583 anv_batch_write_reg(batch, GENX(BLT_TRTT_CR), trtt_cr)
1584 trtt_cr.TRTTEnable = true;
1585 anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_CR), trtt_cr)
1586 trtt_cr.TRTTEnable = true;
1587
1588 if (queue->family->engine_class != INTEL_ENGINE_CLASS_COPY) {
1589 genx_batch_emit_pipe_control(batch, device->info, _3D,
1590 ANV_PIPE_CS_STALL_BIT |
1591 ANV_PIPE_TLB_INVALIDATE_BIT);
1592 }
1593 #endif
1594 return VK_SUCCESS;
1595 }
1596