1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "anv_private.h"
25 
26 #include "common/intel_aux_map.h"
27 #include "common/intel_sample_positions.h"
28 #include "common/intel_pixel_hash.h"
29 #include "genxml/gen_macros.h"
30 #include "genxml/genX_pack.h"
31 
32 #include "vk_standard_sample_locations.h"
33 
34 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT_GRL
35 #include "grl/genX_grl.h"
36 #endif
37 
38 #include "genX_mi_builder.h"
39 
40 #include "vk_util.h"
41 #include "vk_format.h"
42 
43 static void
44 genX(emit_slice_hashing_state)(struct anv_device *device,
45                                struct anv_batch *batch)
46 {
47 #if GFX_VER == 11
48    /* Gfx11 hardware has two pixel pipes at most. */
49    for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
50       assert(device->info->ppipe_subslices[i] == 0);
51 
52    if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
53      return;
54 
55    if (!device->slice_hash.alloc_size) {
56       unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
57       device->slice_hash =
58          anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
59 
60       const bool flip = device->info->ppipe_subslices[0] <
61                      device->info->ppipe_subslices[1];
62       struct GENX(SLICE_HASH_TABLE) table;
63       intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
64 
65       GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
66    }
67 
68    anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
69       ptr.SliceHashStatePointerValid = true;
70       ptr.SliceHashTableStatePointer = device->slice_hash.offset;
71    }
72 
73    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
74       mode.SliceHashingTableEnable = true;
75    }
76 #elif GFX_VERx10 == 120
77    /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
78     * present with n active dual subslices.
79     */
80    unsigned ppipes_of[3] = {};
81 
82    for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
83       for (unsigned p = 0; p < 3; p++)
84          ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
85    }
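   /* Illustrative example (hypothetical fusing): pixel pipes with 2, 2 and 1
    * dual subslices give ppipes_of[2] == 2, ppipes_of[1] == 1 and
    * ppipes_of[0] == 0, which selects the first ThreeWayTableEntry case
    * below.
    */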
86 
87    /* Gfx12 has three pixel pipes. */
88    for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
89       assert(device->info->ppipe_subslices[p] == 0);
90 
91    if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
92       /* All three pixel pipes have the maximum number of active dual
93        * subslices, or there is only one active pixel pipe: Nothing to do.
94        */
95       return;
96    }
97 
98    anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
99       p.SliceHashControl[0] = TABLE_0;
100 
101       if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
102          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
103       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
104          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
105 
106       if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
107          intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
108       else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
109          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
110       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
111          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
112       else
113          unreachable("Illegal fusing.");
114    }
115 
116    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
117       p.SubsliceHashingTableEnable = true;
118       p.SubsliceHashingTableEnableMask = true;
119    }
120 #elif GFX_VERx10 == 125
121    /* Calculate the set of present pixel pipes, and another set of
122     * present pixel pipes with 2 dual subslices enabled; the latter
123     * will appear in the hashing table with twice the frequency of
124     * pixel pipes with a single dual subslice present.
125     */
126    uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
127    for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
128       if (device->info->ppipe_subslices[p] > 0)
129          ppipe_mask1 |= (1u << p);
130       if (device->info->ppipe_subslices[p] > 1)
131          ppipe_mask2 |= (1u << p);
132    }
133    assert(ppipe_mask1);
134 
135    if (!device->slice_hash.alloc_size) {
136       unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
137       device->slice_hash =
138          anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
139 
140       struct GENX(SLICE_HASH_TABLE) table;
141 
142       /* Note that the hardware expects an array of 7 tables, each
143        * intended to specify the pixel pipe hashing behavior for one
144        * possible slice count between 2 and 8. However, that doesn't
145        * actually work, among other reasons due to hardware bugs that
146        * will cause the GPU to erroneously access the table at the
147        * wrong index in some cases, so in practice all 7 tables need
148        * to be initialized to the same value.
149        */
150       for (unsigned i = 0; i < 7; i++)
151          intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
152                                              table.Entry[i][0]);
153 
154       GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
155    }
156 
157    anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
158       ptr.SliceHashStatePointerValid = true;
159       ptr.SliceHashTableStatePointer = device->slice_hash.offset;
160    }
161 
162    /* TODO: Figure out FCV support for other platforms.
163     * Testing indicates that FCV is broken on gfx125.
164     * Let's disable FCV for now till we figure out what's wrong.
165     *
166     * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
167     *
168     * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
169     * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10318
170     * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10795
171     * Ref: Internal issue 1480 about Unreal Engine 5.1
172     */
173    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
174       mode.SliceHashingTableEnable = true;
175       mode.SliceHashingTableEnableMask = true;
176       mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
177 				    hashing32x32 : NormalMode);
178       mode.CrossSliceHashingModeMask = -1;
179       mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
180       mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
181    }
182 #endif
183 }
184 
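/* Emit STATE_SYSTEM_MEM_FENCE_ADDRESS pointing at the device's dedicated
 * mem_fence_bo. Only emitted on Xe2+ (GFX_VERx10 >= 200); a no-op on
 * earlier generations.
 */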
185 static void
186 state_system_mem_fence_address_emit(struct anv_device *device, struct anv_batch *batch)
187 {
188 #if GFX_VERx10 >= 200
189    struct anv_address addr = { .bo = device->mem_fence_bo };
190    anv_batch_emit(batch, GENX(STATE_SYSTEM_MEM_FENCE_ADDRESS), mem_fence_addr) {
191       mem_fence_addr.SystemMemoryFenceAddress = addr;
192    }
193 #endif
194 }
195 
196 static void
197 init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
198 {
199    UNUSED struct anv_device *device = queue->device;
200 
201 #if GFX_VER >= 11
202    /* Starting with GFX version 11, SLM is no longer part of the L3$ config
203     * so it never changes throughout the lifetime of the VkDevice.
204     */
205    const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
206    genX(emit_l3_config)(batch, device, cfg);
207    device->l3_config = cfg;
208 #endif
209 
210 #if GFX_VERx10 == 125
211    /* Even though L3 partial write merging is supposed to be enabled
212     * by default on Gfx12.5 according to the hardware spec, i915
213     * appears to accidentally clear the enables during context
214     * initialization, so make sure to enable them here since partial
215     * write merging has a large impact on rendering performance.
216     */
217    anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
218       reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
219       reg.CompressiblePartialWriteMergeEnable = true;
220       reg.CoherentPartialWriteMergeEnable = true;
221       reg.CrossTilePartialWriteMergeEnable = true;
222    }
223 #endif
224 
225    /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
226     * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
227     */
228 #if GFX_VER >= 12
229 
230 #if GFX_VERx10 >= 125
231    /* Wa_14016407139:
232     *
233     * "On Surface state base address modification, for 3D workloads, SW must
234     *  always program PIPE_CONTROL either with CS Stall or PS sync stall. In
235     *  both the cases set Render Target Cache Flush Enable".
236     */
237    genx_batch_emit_pipe_control(batch, device->info,
238                                 0,
239                                 ANV_PIPE_CS_STALL_BIT |
240                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
241 #endif
242 
243    /* GEN:BUG:1607854226:
244     *
245     *  Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
246     *  Fortunately, we always start the context off in 3D mode.
247     */
248    uint32_t mocs = device->isl_dev.mocs.internal;
249    anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
250       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
251       sba.GeneralStateBufferSize  = 0xfffff;
252       sba.GeneralStateMOCS = mocs;
253       sba.GeneralStateBaseAddressModifyEnable = true;
254       sba.GeneralStateBufferSizeModifyEnable = true;
255 
256       sba.StatelessDataPortAccessMOCS = mocs;
257 
258       sba.SurfaceStateBaseAddress =
259          (struct anv_address) { .offset =
260          device->physical->va.internal_surface_state_pool.addr,
261       };
262       sba.SurfaceStateMOCS = mocs;
263       sba.SurfaceStateBaseAddressModifyEnable = true;
264 
265       sba.DynamicStateBaseAddress =
266          (struct anv_address) { .offset =
267          device->physical->va.dynamic_state_pool.addr,
268       };
269       sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
270                                     device->physical->va.dynamic_visible_pool.size) / 4096;
271       sba.DynamicStateMOCS = mocs;
272       sba.DynamicStateBaseAddressModifyEnable = true;
273       sba.DynamicStateBufferSizeModifyEnable = true;
274 
275       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
276       sba.IndirectObjectBufferSize = 0xfffff;
277       sba.IndirectObjectMOCS = mocs;
278       sba.IndirectObjectBaseAddressModifyEnable = true;
279       sba.IndirectObjectBufferSizeModifyEnable = true;
280 
281       sba.InstructionBaseAddress =
282          (struct anv_address) { .offset =
283          device->physical->va.instruction_state_pool.addr,
284       };
285       sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
286       sba.InstructionMOCS = mocs;
287       sba.InstructionBaseAddressModifyEnable = true;
288       sba.InstructionBuffersizeModifyEnable = true;
289 
290 #if GFX_VER >= 11
291       sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
292       sba.BindlessSamplerStateBufferSize = 0;
293       sba.BindlessSamplerStateMOCS = mocs;
294       sba.BindlessSamplerStateBaseAddressModifyEnable = true;
295 #endif
296 
297       if (device->physical->indirect_descriptors) {
298          sba.BindlessSurfaceStateBaseAddress =
299             (struct anv_address) { .offset =
300             device->physical->va.bindless_surface_state_pool.addr,
301          };
302          sba.BindlessSurfaceStateSize =
303             anv_physical_device_bindless_heap_size(device->physical, false) /
304             ANV_SURFACE_STATE_SIZE - 1;
305          sba.BindlessSurfaceStateMOCS = mocs;
306          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
307       } else {
308          /* Bindless Surface State & Bindless Sampler State are aligned to the
309           * same heap
310           */
311          sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
312             .offset = device->physical->va.internal_surface_state_pool.addr,
313          };
314          sba.BindlessSurfaceStateSize =
315             (device->physical->va.internal_surface_state_pool.size +
316              device->physical->va.bindless_surface_state_pool.size) - 1;
317          sba.BindlessSurfaceStateMOCS = mocs;
318          sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
319       }
320 
321 #if GFX_VERx10 >= 125
322       sba.L1CacheControl = L1CC_WB;
323 #endif
324    }
325 
326    /* Disable the POOL_ALLOC mechanism in HW. We found that this state can get
327     * corrupted (likely due to leaking from another context), so the default
328     * value should be disabled. It doesn't cost anything to set it once at
329     * device initialization.
330     */
331 #if GFX_VER >= 11 && GFX_VERx10 < 125
332    anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
333       btpa.MOCS = mocs;
334       btpa.BindingTablePoolEnable = false;
335    }
336 #endif
337 
338    struct mi_builder b;
339    mi_builder_init(&b, device->info, batch);
340 
341    mi_store(&b, mi_reg64(ANV_BINDLESS_SURFACE_BASE_ADDR_REG),
342                 mi_imm(device->physical->va.internal_surface_state_pool.addr));
343 #endif /* GFX_VER >= 12 */
344 
345 #if GFX_VERx10 >= 125
346    if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
347       anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
348          /* TODO: This is the timeout after which the bucketed thread
349           *       dispatcher will kick off a wave of threads. We go with the
350           *       lowest value for now. It could be tweaked on a
351           *       per-application basis (drirc).
352           */
353          btd.DispatchTimeoutCounter = _64clocks;
354          /* BSpec 43851: "This field must be programmed to 6h i.e. memory
355           *               backed buffer must be 128KB."
356           */
357          btd.PerDSSMemoryBackedBufferSize = 6;
358          btd.MemoryBackedBufferBasePointer = (struct anv_address) {
359             /* This batch doesn't have a reloc list so we can't use the BO
360              * here.  We just use the address directly.
361              */
362             .offset = device->btd_fifo_bo->offset,
363          };
364 #if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
365          btd.BTDMidthreadpreemption = false;
366 #endif
367       }
368    }
369 #endif
370 
371    state_system_mem_fence_address_emit(device, batch);
372 }
373 
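/* Alias so the queue init code below can emit the drawing rectangle with a
 * single spelling: Gfx20+ uses the _FAST variant of the packet, while older
 * generations use the classic 3DSTATE_DRAWING_RECTANGLE.
 */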
374 #if GFX_VER >= 20
375 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
376 #else
377 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
378 #endif
379 
380 static VkResult
381 init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
382 {
383    struct anv_device *device = queue->device;
384    UNUSED const struct intel_device_info *devinfo = queue->device->info;
385 
386    struct anv_async_submit *submit;
387    VkResult result = anv_async_submit_create(queue,
388                                              &device->batch_bo_pool,
389                                              is_companion_rcs_batch,
390                                              true, &submit);
391    if (result != VK_SUCCESS)
392       return result;
393 
394    struct anv_batch *batch = &submit->batch;
395 
396    genX(emit_pipeline_select)(batch, _3D, device);
397 
398 #if GFX_VER == 9
399    anv_batch_write_reg(batch, GENX(CACHE_MODE_1), cm1) {
400       cm1.FloatBlendOptimizationEnable = true;
401       cm1.FloatBlendOptimizationEnableMask = true;
402       cm1.MSCRAWHazardAvoidanceBit = true;
403       cm1.MSCRAWHazardAvoidanceBitMask = true;
404       cm1.PartialResolveDisableInVC = true;
405       cm1.PartialResolveDisableInVCMask = true;
406    }
407 #endif
408 
409    anv_batch_emit(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
410 
411    anv_batch_emit(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
412       rect.ClippedDrawingRectangleYMin = 0;
413       rect.ClippedDrawingRectangleXMin = 0;
414       rect.ClippedDrawingRectangleYMax = UINT16_MAX;
415       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
416       rect.DrawingRectangleOriginY = 0;
417       rect.DrawingRectangleOriginX = 0;
418    }
419 
420    anv_batch_emit(batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
421 
422    /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
423     *
424     *   "3DSTATE_RASTER if used must be programmed prior to using this
425     *    packet."
426     *
427     * Emit this before 3DSTATE_WM_HZ_OP below.
428     */
429    anv_batch_emit(batch, GENX(3DSTATE_RASTER), rast) {
430       rast.APIMode = DX101;
431    }
432 
433    /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
434     *
435     *    "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
436     *     change the Number of Multisamples. This packet must not be used to
437     *     change Number of Multisamples in a rendering sequence."
438     *
439     * Emit this before 3DSTATE_WM_HZ_OP below.
440     */
441    anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
442 
443    /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
444     * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
445     * Clear." It mentions that the packet overrides GPU state for the clear
446     * operation and needs to be reset to 0s to clear the overrides. Depending
447     * on the kernel, we may not get a context with the state for this packet
448     * zeroed. Do it ourselves just in case. We've observed this to prevent a
449     * number of GPU hangs on ICL.
450     */
451    anv_batch_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
452 
453    genX(emit_sample_pattern)(batch, NULL);
454 
455 #if GFX_VER == 11
456    /* Bit 5 "Headerless Message for Pre-emptable Contexts" in the SAMPLER
457     * MODE register defaults to 0, which means headerless sampler messages
458     * are not allowed for pre-emptable contexts. Set bit 5 to 1 to allow
459     * them.
460     */
461    anv_batch_write_reg(batch, GENX(SAMPLER_MODE), sm) {
462       sm.HeaderlessMessageforPreemptableContexts = true;
463       sm.HeaderlessMessageforPreemptableContextsMask = true;
464    }
465 
466    /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
467     * HALF_SLICE_CHICKEN7 register.
468     */
469    anv_batch_write_reg(batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
470       hsc7.EnabledTexelOffsetPrecisionFix = true;
471       hsc7.EnabledTexelOffsetPrecisionFixMask = true;
472    }
473 
474    anv_batch_write_reg(batch, GENX(TCCNTLREG), tcc) {
475       tcc.L3DataPartialWriteMergingEnable = true;
476       tcc.ColorZPartialWriteMergingEnable = true;
477       tcc.URBPartialWriteMergingEnable = true;
478       tcc.TCDisable = true;
479    }
480 #endif
481    genX(emit_slice_hashing_state)(device, batch);
482 
483 #if GFX_VER >= 11
484    /* The hardware specification recommends disabling repacking for
485     * compatibility with the decompression mechanism in the display controller.
486     */
487    if (device->info->disable_ccs_repack) {
488       anv_batch_write_reg(batch, GENX(CACHE_MODE_0), cm0) {
489          cm0.DisableRepackingforCompression = true;
490          cm0.DisableRepackingforCompressionMask = true;
491       }
492    }
493 
494    /* An unknown issue is causing VS push constants to become
495     * corrupted during object-level preemption. For now, restrict
496     * to command-buffer-level preemption to avoid rendering
497     * corruption.
498     */
499    anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
500       cc1.ReplayMode = MidcmdbufferPreemption;
501       cc1.ReplayModeMask = true;
502 
503 #if GFX_VERx10 == 120
504       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
505       cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
506 #endif
507    }
508 
509 #if INTEL_NEEDS_WA_1806527549
510    /* Wa_1806527549 says to disable the following HiZ optimization when the
511     * depth buffer is D16_UNORM. We've found the WA to help with more depth
512     * buffer configurations, however, so we always disable it just to be safe.
513     */
514    anv_batch_write_reg(batch, GENX(HIZ_CHICKEN), reg) {
515       reg.HZDepthTestLEGEOptimizationDisable = true;
516       reg.HZDepthTestLEGEOptimizationDisableMask = true;
517    }
518 #endif
519 
520 #if GFX_VER == 12
521    anv_batch_write_reg(batch, GENX(FF_MODE2), reg) {
522       /* On Alchemist, the FF_MODE2 docs for the GS timer say:
523        *
524        *    "The timer value must be set to 224."
525        *
526        * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
527        * and that this is necessary to avoid hanging the HS/DS units.  It
528        * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
529        *
530        * The HS timer docs also have the same quote for Alchemist.  I am
531        * unaware of a reason it needs to be set to 224 on Tigerlake, but
532        * we do so for consistency if nothing else.
533        *
534        * For the TDS timer value, the docs say:
535        *
536        *    "For best performance, a value of 4 should be programmed."
537        *
538        * i915 also sets it this way on Tigerlake due to workarounds.
539        *
540        * The default VS timer appears to be 0, so we leave it at that.
541        */
542       reg.GSTimerValue  = 224;
543       reg.HSTimerValue  = 224;
544       reg.TDSTimerValue = 4;
545       reg.VSTimerValue  = 0;
546    }
547 #endif
548 
549 #if INTEL_NEEDS_WA_1508744258
550    /*    Disable RHWO by setting 0x7010[14] by default except during resolve
551     *    pass.
552     *
553     * We implement global disabling of the optimization here and we toggle it
554     * in anv_image_ccs_op().
555     */
556    anv_batch_write_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
557       c1.RCCRHWOOptimizationDisable = true;
558       c1.RCCRHWOOptimizationDisableMask = true;
559    }
560 #endif
561 
562 #if GFX_VERx10 < 125
563 #define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
564 #else
565 #define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
566 #endif
567 
568    /* Enable the new line drawing algorithm that produces higher quality
569     * lines.
570     */
571    anv_batch_write_reg(batch, AA_LINE_QUALITY_REG, c3) {
572       c3.AALineQualityFix = true;
573       c3.AALineQualityFixMask = true;
574    }
575 #endif
576 
577 #if GFX_VER == 12
578    if (device->info->has_aux_map) {
579       uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
580       assert(aux_base_addr % (32 * 1024) == 0);
581       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
582          lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
583          lri.DataDWord = aux_base_addr & 0xffffffff;
584       }
585       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
586          lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
587          lri.DataDWord = aux_base_addr >> 32;
588       }
589    }
590 #endif
591 
592 #if GFX_VERx10 == 125
593    anv_batch_write_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
594       reg.TBIMRBatchSizeOverride = true;
595       reg.TBIMROpenBatchEnable = true;
596       reg.TBIMRFastClip = true;
597       reg.TBIMRBatchSizeOverrideMask = true;
598       reg.TBIMROpenBatchEnableMask = true;
599       reg.TBIMRFastClipMask = true;
600    }
601 #endif
602 
603    /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
604     * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
605     *
606     * This is only safe on kernels with context isolation support.
607     */
608    assert(device->physical->info.has_context_isolation);
609    anv_batch_write_reg(batch, GENX(CS_DEBUG_MODE2), csdm2) {
610       csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
611       csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
612    }
613 
614    init_common_queue_state(queue, batch);
615 
616    /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
617     * the dynamic state base address, we need to emit this instruction after
618     * STATE_BASE_ADDRESS in init_common_queue_state().
619     */
620 #if GFX_VER >= 30
621    anv_batch_emit(batch, GENX(3DSTATE_COARSE_PIXEL), cps);
622 #elif GFX_VER >= 12
623    anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
624       assert(device->cps_states.alloc_size != 0);
625       /* Offset 0 is the disabled state */
626       cps.CoarsePixelShadingStateArrayPointer =
627          device->cps_states.offset;
628    }
629 #elif GFX_VER == 11
630    anv_batch_emit(batch, GENX(3DSTATE_CPS), cps);
631 #endif
632 
633 #if GFX_VERx10 >= 125
634    anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
635 #if GFX_VER >= 30
636       cm.EnableVariableRegisterSizeAllocation = true;
637 #endif
638       cm.Mask1 = 0xffff;
639 #if GFX_VERx10 >= 200
640       cm.Mask2 = 0xffff;
641 #endif
642    }
643    anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
644    anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
645 
646    /* We are no longer required to explicitly flush or invalidate caches since
647     * PIPELINE_SELECT is getting deprecated on Xe2+.
648     */
649 #if GFX_VER < 20
650    genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
651                                       ANV_NULL_ADDRESS,
652                                       0,
653                                       ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
654 #endif
655 
656    genX(emit_pipeline_select)(batch, GPGPU, device);
657    anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
658       cfe.MaximumNumberofThreads =
659          devinfo->max_cs_threads * devinfo->subslice_total;
660    }
661 
662    /* We are no longer required to explicitly flush or invalidate caches since
663     * PIPELINE_SELECT is getting deprecated on Xe2+.
664     */
665 #if GFX_VER < 20
666    genx_batch_emit_pipe_control_write(batch, device->info, _3D, NoWrite,
667                                       ANV_NULL_ADDRESS,
668                                       0,
669                                       ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
670 #endif
671 
672    genX(emit_pipeline_select)(batch, _3D, device);
673 #endif
674 
675 #if GFX_VER >= 20
676    anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
677       p.DX10OGLBorderModeforYCRCB = true;
678       p.DX10OGLBorderModeforYCRCBMask = true;
679 #if INTEL_NEEDS_WA_14019857787
680       p.EnableOOOreadsinRCPB = true;
681       p.EnableOOOreadsinRCPBMask = true;
682 #endif
683    }
684 #endif
685 
686    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
687 
688    result = batch->status;
689    if (result != VK_SUCCESS) {
690       anv_async_submit_destroy(submit);
691       return result;
692    }
693 
694    result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
695    if (result != VK_SUCCESS) {
696       anv_async_submit_destroy(submit);
697       return result;
698    }
699 
700    if (is_companion_rcs_batch)
701       queue->init_companion_submit = submit;
702    else
703       queue->init_submit = submit;
704 
705    return VK_SUCCESS;
706 }
707 
708 static VkResult
709 init_compute_queue_state(struct anv_queue *queue)
710 {
711    struct anv_device *device = queue->device;
712    UNUSED const struct intel_device_info *devinfo = device->info;
713    struct anv_async_submit *submit;
714    VkResult result = anv_async_submit_create(queue,
715                                              &device->batch_bo_pool,
716                                              false, true, &submit);
717    if (result != VK_SUCCESS)
718       return result;
719 
720    struct anv_batch *batch = &submit->batch;
721 
722    genX(emit_pipeline_select)(batch, GPGPU, queue->device);
723 
724 #if GFX_VER == 12
725    if (queue->device->info->has_aux_map) {
726       uint64_t aux_base_addr =
727          intel_aux_map_get_base(queue->device->aux_map_ctx);
728       assert(aux_base_addr % (32 * 1024) == 0);
729       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
730          lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
731          lri.DataDWord = aux_base_addr & 0xffffffff;
732       }
733       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
734          lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
735          lri.DataDWord = aux_base_addr >> 32;
736       }
737    }
738 #else
739    assert(!queue->device->info->has_aux_map);
740 #endif
741 
742    /* Wa_14015782607 - Issue pipe control with HDC_flush and
743     * untyped cache flush set to 1 when CCS has NP state update with
744     * STATE_COMPUTE_MODE.
745     */
746    if (intel_needs_workaround(devinfo, 14015782607) &&
747        queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
748       genx_batch_emit_pipe_control(batch, devinfo, GPGPU,
749                                    ANV_PIPE_CS_STALL_BIT |
750                                    ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
751                                    ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
752    }
753 
754 #if GFX_VERx10 >= 125
755    /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
756     * emitting NP state commands with ATS-M in compute mode.
757     */
758    if (intel_device_info_is_atsm(devinfo) &&
759        queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
760       genx_batch_emit_pipe_control
761          (batch, devinfo, GPGPU,
762           ANV_PIPE_CS_STALL_BIT |
763           ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
764           ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
765           ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
766           ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
767           ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
768           ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
769    }
770 
771    anv_batch_emit(batch, GENX(STATE_COMPUTE_MODE), cm) {
772 #if GFX_VER >= 30
773       cm.EnableVariableRegisterSizeAllocationMask = 1;
774       cm.EnableVariableRegisterSizeAllocation = true;
775 #endif
776 #if GFX_VER >= 20
777       cm.AsyncComputeThreadLimit = ACTL_Max8;
778       cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
779       cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit;
780       cm.AsyncComputeThreadLimitMask = 0x7;
781       cm.ZPassAsyncComputeThreadLimitMask = 0x7;
782       cm.ZAsyncThrottlesettingsMask = 0x3;
783 #else
784       cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
785       cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
786       cm.PixelAsyncComputeThreadLimitMask = 0x7;
787       cm.ZPassAsyncComputeThreadLimitMask = 0x7;
788       if (intel_device_info_is_mtl_or_arl(devinfo)) {
789          cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit;
790          cm.ZAsyncThrottlesettingsMask = 0x3;
791       }
792 #endif
793    }
794 #endif
795 
796    init_common_queue_state(queue, batch);
797 
798 #if GFX_VERx10 >= 125
799    anv_batch_emit(batch, GENX(CFE_STATE), cfe) {
800       cfe.MaximumNumberofThreads =
801          devinfo->max_cs_threads * devinfo->subslice_total;
802    }
803 #endif
804 
805    anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
806 
807    result = batch->status;
808    if (result != VK_SUCCESS) {
809       anv_async_submit_destroy(submit);
810       return result;
811    }
812 
813    result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
814    if (result != VK_SUCCESS) {
815       anv_async_submit_destroy(submit);
816       return result;
817    }
818 
819    queue->init_submit = submit;
820 
821    return VK_SUCCESS;
822 }
823 
824 static VkResult
825 init_copy_video_queue_state(struct anv_queue *queue)
826 {
827    struct anv_device *device = queue->device;
828    UNUSED const struct intel_device_info *devinfo = device->info;
829 
830    struct anv_async_submit *submit;
831    VkResult result = anv_async_submit_create(queue,
832                                              &device->batch_bo_pool,
833                                              false, true, &submit);
834    if (result != VK_SUCCESS)
835       return result;
836 
837    struct anv_batch *batch = &submit->batch;
838 
839 #if GFX_VER >= 12
840    if (devinfo->has_aux_map) {
841       uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);
842 
843       if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
844 #if GFX_VERx10 >= 125
845          reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
846 #endif
847       }
848 
849       uint64_t aux_base_addr =
850          intel_aux_map_get_base(queue->device->aux_map_ctx);
851       assert(aux_base_addr % (32 * 1024) == 0);
852       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
853          lri.RegisterOffset = reg;
854          lri.DataDWord = aux_base_addr & 0xffffffff;
855       }
856       anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
857          lri.RegisterOffset = reg + 4;
858          lri.DataDWord = aux_base_addr >> 32;
859       }
860    }
861 #else
862    assert(!queue->device->info->has_aux_map);
863 #endif
864 
865    state_system_mem_fence_address_emit(device, batch);
866 
867    if (batch->start != batch->next) {
868       anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_END), bbe);
869 
870       result = batch->status;
871       if (result != VK_SUCCESS) {
872          anv_async_submit_destroy(submit);
873          return result;
874       }
875 
876       result = device->kmd_backend->queue_exec_async(submit, 0, NULL, 0, NULL);
877       if (result != VK_SUCCESS) {
878          anv_async_submit_destroy(submit);
879          return result;
880       }
881 
882       queue->init_submit = submit;
883    } else {
884       anv_async_submit_destroy(submit);
885    }
886 
887    return VK_SUCCESS;
888 }
889 
890 void
891 genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
892 {
893    assert(pdevice->info.verx10 == GFX_VERx10);
894 
895 #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT
896 #if ANV_SUPPORT_RT_GRL
897    genX(grl_load_rt_uuid)(pdevice->rt_uuid);
898    pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
899 #else
900    STATIC_ASSERT(sizeof(ANV_RT_UUID_MACRO) == VK_UUID_SIZE);
901    memcpy(pdevice->rt_uuid, ANV_RT_UUID_MACRO, VK_UUID_SIZE);
902 #endif
903 #endif
904 
905    pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
906    pdevice->cmd_capture_data = genX(cmd_capture_data);
907 
908    pdevice->gpgpu_pipeline_value = GPGPU;
909 
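   /* A vertex element that is Valid but sources constant zeros for all four
    * components. Presumably this prepacked template is used wherever the
    * driver needs a harmless VERTEX_ELEMENT_STATE, e.g. for shader inputs
    * with no vertex buffer bound.
    */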
910    struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
911       .Valid = true,
912       .Component0Control = VFCOMP_STORE_0,
913       .Component1Control = VFCOMP_STORE_0,
914       .Component2Control = VFCOMP_STORE_0,
915       .Component3Control = VFCOMP_STORE_0,
916    };
917    GENX(VERTEX_ELEMENT_STATE_pack)(NULL, pdevice->empty_vs_input, &empty_ve);
918 }
919 
920 VkResult
921 genX(init_device_state)(struct anv_device *device)
922 {
923    VkResult res;
924 
925    device->slice_hash = (struct anv_state) { 0 };
926    for (uint32_t i = 0; i < device->queue_count; i++) {
927       struct anv_queue *queue = &device->queues[i];
928       switch (queue->family->engine_class) {
929       case INTEL_ENGINE_CLASS_RENDER:
930          res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
931          break;
932       case INTEL_ENGINE_CLASS_COMPUTE: {
933          res = init_compute_queue_state(queue);
934          if (res != VK_SUCCESS)
935             return res;
936 
937          /**
938           * Execute RCS init batch by default on the companion RCS command buffer in
939           * order to support MSAA copy/clear operations on the compute queue.
940           */
941          res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
942          break;
943       }
944       case INTEL_ENGINE_CLASS_VIDEO:
945          res = init_copy_video_queue_state(queue);
946          break;
947       case INTEL_ENGINE_CLASS_COPY:
948          res = init_copy_video_queue_state(queue);
949          if (res != VK_SUCCESS)
950             return res;
951 
952          /**
953           * Execute RCS init batch by default on the companion RCS command buffer in
954           * order to support MSAA copy/clear operations on the copy queue.
955           */
956          res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
957          break;
958       default:
959          res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
960          break;
961       }
962       if (res != VK_SUCCESS)
963          return res;
964 
965       if (!device->trtt.queue &&
966           queue->family->queueFlags & VK_QUEUE_SPARSE_BINDING_BIT)
967          device->trtt.queue = queue;
968    }
969 
970    return res;
971 }
972 
973 #if GFX_VERx10 >= 125
974 #define maybe_for_each_shading_rate_op(name) \
975    for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
976         name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
977         name++)
978 #elif GFX_VER >= 12
979 #define maybe_for_each_shading_rate_op(name)
980 #endif
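/* On Gfx12.0 the combiner opcodes don't exist in CPS_STATE, so the macro
 * expands to nothing and the op0/op1 "loops" below collapse into a single
 * pass; on Gfx12.5+ it iterates over every combiner op from KEEP to MUL.
 */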
981 
982 /* Rather than re-emitting the CPS_STATE structure every time it changes, and
983  * for as many viewports as needed, we can just prepare all possible cases
984  * up front and pick the right offset from the prepacked states when needed.
985  */
986 void
987 genX(init_cps_device_state)(struct anv_device *device)
988 {
989 #if GFX_VER >= 12 && GFX_VER < 30
990    void *cps_state_ptr = device->cps_states.map;
991 
992    /* Disabled CPS mode */
993    for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
994       /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
995        *
996        *   "It is an INVALID configuration to set the CPS mode other than
997        *    CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
998        *    Such configuration should be disallowed at the API level, and
999        *    rendering results are undefined."
1000        *
1001        * Since we select this state when per coarse pixel is disabled and that
1002        * includes when per-sample dispatch is enabled, we need to ensure this
1003        * is set to NONE.
1004        */
1005       struct GENX(CPS_STATE) cps_state = {
1006          .CoarsePixelShadingMode = CPS_MODE_NONE,
1007       };
1008 
1009       GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
1010       cps_state_ptr += GENX(CPS_STATE_length) * 4;
1011    }
1012 
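   /* The remaining entries cover the enabled cases: for each combiner-op pair
    * (Gfx12.5+ only) and each 1x/2x/4x coarse pixel size in X and Y, one
    * CPS_STATE per viewport is packed back to back, so consumers can pick an
    * offset into device->cps_states instead of re-packing state at draw time.
    */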
1013    maybe_for_each_shading_rate_op(op0) {
1014       maybe_for_each_shading_rate_op(op1) {
1015          for (uint32_t x = 1; x <= 4; x *= 2) {
1016             for (uint32_t y = 1; y <= 4; y *= 2) {
1017                struct GENX(CPS_STATE) cps_state = {
1018                   .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
1019                   .MinCPSizeX = x,
1020                   .MinCPSizeY = y,
1021                };
1022 
1023 #if GFX_VERx10 >= 125
1024                static const uint32_t combiner_ops[] = {
1025                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR]    = PASSTHROUGH,
1026                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
1027                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR]     = HIGH_QUALITY,
1028                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR]     = LOW_QUALITY,
1029                   [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR]     = RELATIVE,
1030                };
1031 
1032                cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
1033                cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
1034 #endif /* GFX_VERx10 >= 125 */
1035 
1036                for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
1037                   GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
1038                   cps_state_ptr += GENX(CPS_STATE_length) * 4;
1039                }
1040             }
1041          }
1042       }
1043    }
1044 #endif /* GFX_VER >= 12 && GFX_VER < 30 */
1045 }
1046 
1047 void
1048 genX(emit_l3_config)(struct anv_batch *batch,
1049                      const struct anv_device *device,
1050                      const struct intel_l3_config *cfg)
1051 {
1052 #if GFX_VER < 20
1053    UNUSED const struct intel_device_info *devinfo = device->info;
1054 
1055 #if GFX_VER >= 12
1056 #define L3_ALLOCATION_REG GENX(L3ALLOC)
1057 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
1058 #else
1059 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
1060 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
1061 #endif
1062 
1063    anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
1064       if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
1065          assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
1066                           cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
1067                           cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
1068                           cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
1069 #if GFX_VER >= 12
1070          l3cr.L3FullWayAllocationEnable = true;
1071 #else
1072          unreachable("Invalid L3$ config");
1073 #endif
1074       } else {
1075 #if GFX_VER < 11
1076          l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
1077 #endif
1078 #if INTEL_NEEDS_WA_1406697149
1079          /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
1080           * set in the L3CNTLREG register. The default setting of the bit is not
1081           * the desired behavior.
1082           */
1083          l3cr.ErrorDetectionBehaviorControl = true;
1084          l3cr.UseFullWays = true;
1085 #endif /* INTEL_NEEDS_WA_1406697149 */
1086          assert(cfg->n[INTEL_L3P_IS] == 0);
1087          assert(cfg->n[INTEL_L3P_C] == 0);
1088          assert(cfg->n[INTEL_L3P_T] == 0);
1089          l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
1090          l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
1091          l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
1092          l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
1093       }
1094    }
1095 #endif /* GFX_VER < 20 */
1096 }
1097 
1098 void
1099 genX(emit_sample_pattern)(struct anv_batch *batch,
1100                           const struct vk_sample_locations_state *sl)
1101 {
1102    assert(sl == NULL || sl->grid_size.width == 1);
1103    assert(sl == NULL || sl->grid_size.height == 1);
1104 
1105    /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
1106     * VkPhysicalDeviceFeatures::standardSampleLocations.
1107     */
1108    anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
1109       /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
1110        *
1111        *    "When programming the sample offsets (for NUMSAMPLES_4 or _8
1112        *    and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
1113        *    (or 7 for 8X, or 15 for 16X) must have monotonically increasing
1114        *    distance from the pixel center. This is required to get the
1115        *    correct centroid computation in the device."
1116        *
1117        * However, the Vulkan spec seems to require that the samples occur
1118        * in the order provided through the API. The standard sample patterns
1119        * have the above property that they have monotonically increasing
1120        * distances from the center but client-provided ones do not. As long as
1121        * this only affects centroid calculations as the docs say, we should be
1122        * ok because OpenGL and Vulkan only require that the centroid be some
1123        * lit sample and that it's the same for all samples in a pixel; they
1124        * have no requirement that it be the one closest to center.
1125        */
1126       for (uint32_t i = 1; i <= 16; i *= 2) {
1127          switch (i) {
1128          case VK_SAMPLE_COUNT_1_BIT:
1129             if (sl && sl->per_pixel == i) {
1130                INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
1131             } else {
1132                INTEL_SAMPLE_POS_1X(sp._1xSample);
1133             }
1134             break;
1135          case VK_SAMPLE_COUNT_2_BIT:
1136             if (sl && sl->per_pixel == i) {
1137                INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
1138             } else {
1139                INTEL_SAMPLE_POS_2X(sp._2xSample);
1140             }
1141             break;
1142          case VK_SAMPLE_COUNT_4_BIT:
1143             if (sl && sl->per_pixel == i) {
1144                INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
1145             } else {
1146                INTEL_SAMPLE_POS_4X(sp._4xSample);
1147             }
1148             break;
1149          case VK_SAMPLE_COUNT_8_BIT:
1150             if (sl && sl->per_pixel == i) {
1151                INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
1152             } else {
1153                INTEL_SAMPLE_POS_8X(sp._8xSample);
1154             }
1155             break;
1156          case VK_SAMPLE_COUNT_16_BIT:
1157             if (sl && sl->per_pixel == i) {
1158                INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
1159             } else {
1160                INTEL_SAMPLE_POS_16X(sp._16xSample);
1161             }
1162             break;
1163          default:
1164             unreachable("Invalid sample count");
1165          }
1166       }
1167    }
1168 }
1169 
1170 static uint32_t
1171 vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
1172 {
1173    switch (filter) {
1174    default:
1175       unreachable("Invalid filter");
1176    case VK_FILTER_NEAREST:
1177       return anisotropyEnable ?
1178 #if GFX_VER >= 30
1179              MAPFILTER_ANISOTROPIC_FAST :
1180 #else
1181              MAPFILTER_ANISOTROPIC :
1182 #endif
1183              MAPFILTER_NEAREST;
1184    case VK_FILTER_LINEAR:
1185       return anisotropyEnable ?
1186 #if GFX_VER >= 30
1187              MAPFILTER_ANISOTROPIC_FAST :
1188 #else
1189              MAPFILTER_ANISOTROPIC :
1190 #endif
1191              MAPFILTER_LINEAR;
1192    }
1193 }
1194 
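/* Map the API anisotropy ratio onto the hardware's MaximumAnisotropy
 * encoding; judging by the formula it counts in steps of two starting at
 * 2:1, e.g. maxAnisotropy = 2.0 maps to 0 and 16.0 maps to 7.
 */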
1195 static uint32_t
1196 vk_to_intel_max_anisotropy(float ratio)
1197 {
1198    return (CLAMP(ratio, 2, 16) - 2) / 2;
1199 }
1200 
1201 static const uint32_t vk_to_intel_mipmap_mode[] = {
1202    [VK_SAMPLER_MIPMAP_MODE_NEAREST]          = MIPFILTER_NEAREST,
1203    [VK_SAMPLER_MIPMAP_MODE_LINEAR]           = MIPFILTER_LINEAR
1204 };
1205 
1206 static const uint32_t vk_to_intel_tex_address[] = {
1207    [VK_SAMPLER_ADDRESS_MODE_REPEAT]          = TCM_WRAP,
1208    [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
1209    [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE]   = TCM_CLAMP,
1210    [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
1211    [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
1212 };
1213 
1214 /* Vulkan specifies the result of shadow comparisons as:
1215  *     1     if   ref <op> texel,
1216  *     0     otherwise.
1217  *
1218  * The hardware does:
1219  *     0     if texel <op> ref,
1220  *     1     otherwise.
1221  *
1222  * So, these look a bit strange because there's both a negation
1223  * and swapping of the arguments involved.
1224  */
1225 static const uint32_t vk_to_intel_shadow_compare_op[] = {
1226    [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_ALWAYS,
1227    [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LEQUAL,
1228    [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_NOTEQUAL,
1229    [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LESS,
1230    [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GEQUAL,
1231    [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_EQUAL,
1232    [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GREATER,
1233    [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_NEVER,
1234 };
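/* Worked example: VK_COMPARE_OP_LESS means "pass when ref < texel", i.e.
 * "pass when texel > ref". Inverting the sense and letting the hardware
 * evaluate "texel <op> ref" means it must fail when texel <= ref, hence
 * PREFILTEROP_LEQUAL in the table above.
 */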
1235 
1236 static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
1237    [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
1238    [VK_SAMPLER_REDUCTION_MODE_MIN]              = MINIMUM,
1239    [VK_SAMPLER_REDUCTION_MODE_MAX]              = MAXIMUM,
1240 };
1241 
1242 VkResult genX(CreateSampler)(
1243     VkDevice                                    _device,
1244     const VkSamplerCreateInfo*                  pCreateInfo,
1245     const VkAllocationCallbacks*                pAllocator,
1246     VkSampler*                                  pSampler)
1247 {
1248    ANV_FROM_HANDLE(anv_device, device, _device);
1249    struct anv_sampler *sampler;
1250 
1251    sampler = vk_sampler_create(&device->vk, pCreateInfo,
1252                                pAllocator, sizeof(*sampler));
1253    if (!sampler)
1254       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1255 
1256    const struct vk_format_ycbcr_info *ycbcr_info =
1257       sampler->vk.format != VK_FORMAT_UNDEFINED ?
1258       vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
1259    assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));
1260 
1261    sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;
1262 
1263    uint32_t border_color_stride = 64;
1264    uint32_t border_color_offset;
1265    void *border_color_ptr;
1266    if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
1267       border_color_offset = device->border_colors.offset +
1268                             pCreateInfo->borderColor *
1269                             border_color_stride;
1270       border_color_ptr = device->border_colors.map +
1271                          pCreateInfo->borderColor * border_color_stride;
1272    } else {
1273       assert(vk_border_color_is_custom(sampler->vk.border_color));
1274       if (pCreateInfo->flags & VK_SAMPLER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
1275          const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
1276             vk_find_struct_const(pCreateInfo->pNext,
1277                                  OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
1278          if (opaque_info) {
1279             uint32_t alloc_idx = *((const uint32_t *)opaque_info->opaqueCaptureDescriptorData);
1280             sampler->custom_border_color =
1281                anv_state_reserved_array_pool_alloc_index(&device->custom_border_colors, alloc_idx);
1282          } else {
1283             sampler->custom_border_color =
1284                anv_state_reserved_array_pool_alloc(&device->custom_border_colors, true);
1285          }
1286       } else {
1287          sampler->custom_border_color =
1288             anv_state_reserved_array_pool_alloc(&device->custom_border_colors, false);
1289       }
1290       if (sampler->custom_border_color.alloc_size == 0)
1291          return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1292 
1293       border_color_offset = sampler->custom_border_color.offset;
1294       border_color_ptr = sampler->custom_border_color.map;
1295 
1296       union isl_color_value color = { .u32 = {
1297          sampler->vk.border_color_value.uint32[0],
1298          sampler->vk.border_color_value.uint32[1],
1299          sampler->vk.border_color_value.uint32[2],
1300          sampler->vk.border_color_value.uint32[3],
1301       } };
1302 
1303       const struct anv_format *format_desc =
1304          sampler->vk.format != VK_FORMAT_UNDEFINED ?
1305          anv_get_format(device->physical, sampler->vk.format) : NULL;
1306 
1307       if (format_desc && format_desc->n_planes == 1 &&
1308           !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
1309          const struct anv_format_plane *fmt_plane = &format_desc->planes[0];
1310 
1311          assert(!isl_format_has_int_channel(fmt_plane->isl_format));
1312          color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
1313       }
1314 
1315       memcpy(border_color_ptr, color.u32, sizeof(color));
1316    }
1317 
1318    const bool seamless_cube =
1319       !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);
1320 
1321    struct mesa_sha1 ctx;
1322    _mesa_sha1_init(&ctx);
1323 
1324    for (unsigned p = 0; p < sampler->n_planes; p++) {
1325       const bool plane_has_chroma =
1326          ycbcr_info && ycbcr_info->planes[p].has_chroma;
1327       const VkFilter min_filter =
1328          plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
1329                             pCreateInfo->minFilter;
1330       const VkFilter mag_filter =
1331          plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
1332                             pCreateInfo->magFilter;
1333       const bool force_addr_rounding =
1334             device->physical->instance->force_filter_addr_rounding;
1335       const bool enable_min_filter_addr_rounding =
1336             force_addr_rounding || min_filter != VK_FILTER_NEAREST;
1337       const bool enable_mag_filter_addr_rounding =
1338             force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
1339       /* From Broadwell PRM, SAMPLER_STATE:
1340        *   "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
1341        */
1342       enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
1343          anv_get_format(device->physical, sampler->vk.format)->planes[0].isl_format :
1344          ISL_FORMAT_UNSUPPORTED;
1345       const bool isl_format_is_planar_yuv =
1346          plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
1347          isl_format_is_yuv(plane0_isl_format) &&
1348          isl_format_is_planar(plane0_isl_format);
1349 
1350       const uint32_t mip_filter_mode =
1351          isl_format_is_planar_yuv ?
1352          MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
1353 
1354       struct GENX(SAMPLER_STATE) sampler_state = {
1355          .SamplerDisable = false,
1356          .TextureBorderColorMode = DX10OGL,
1357 
1358 #if GFX_VER >= 11
1359          .CPSLODCompensationEnable = true,
1360 #endif
1361 
1362          .LODPreClampMode = CLAMP_MODE_OGL,
1363 
1364          .MipModeFilter = mip_filter_mode,
1365          .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
1366          .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
1367          .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
1368          .AnisotropicAlgorithm =
1369             pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
1370          .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
1371          .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
1372          .ChromaKeyEnable = 0,
1373          .ChromaKeyIndex = 0,
1374          .ChromaKeyMode = 0,
1375          .ShadowFunction =
1376             vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
1377                                         pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
1378          .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,
1379 
1380          .LODClampMagnificationMode = MIPNONE,
1381 
1382          .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
1383          .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1384          .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1385          .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1386          .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1387          .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
1388          .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
1389          .TrilinearFilterQuality = 0,
1390          .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
1391          .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
1392          .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
1393          .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
1394 
1395          .ReductionType =
1396             vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
1397          .ReductionTypeEnable =
1398             sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
1399       };
1400 
1401       /* Pack a version of the SAMPLER_STATE without the border color. We'll
1402        * use it to store into the shader cache and also for hashing.
1403        */
1404       GENX(SAMPLER_STATE_pack)(NULL, sampler->state_no_bc[p], &sampler_state);
1405       _mesa_sha1_update(&ctx, sampler->state_no_bc[p], sizeof(sampler->state_no_bc[p]));
1406 
1407       /* Set the border color pointer after hashing; we don't want the
1408        * allocation order of border colors to influence the hash. We just
1409        * need the sampler parameters to be hashed.
1410        */
1411       sampler_state.BorderColorPointer = border_color_offset;
1412       GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
1413    }
1414 
1415    /* If we have bindless, allocate enough samplers.  We allocate 32 bytes
1416     * for each sampler instead of 16 bytes because we want all bindless
1417     * samplers to be 32-byte aligned so we don't have to use indirect
1418     * sampler messages on them.
1419     */
1420    sampler->bindless_state =
1421       anv_state_pool_alloc(&device->dynamic_state_pool,
1422                            sampler->n_planes * 32, 32);
1423    if (sampler->bindless_state.map) {
1424       memcpy(sampler->bindless_state.map, sampler->state,
1425              sampler->n_planes * GENX(SAMPLER_STATE_length) * 4);
1426    }
1427 
1428    /* Hash the border color */
1429    _mesa_sha1_update(&ctx, border_color_ptr,
1430                      sizeof(union isl_color_value));
1431 
1432    _mesa_sha1_final(&ctx, sampler->sha1);
1433 
1434    *pSampler = anv_sampler_to_handle(sampler);
1435 
1436    return VK_SUCCESS;
1437 }
1438 
1439 void
1440 genX(emit_embedded_sampler)(struct anv_device *device,
1441                             struct anv_embedded_sampler *sampler,
1442                             struct anv_pipeline_embedded_sampler_binding *binding)
1443 {
1444    sampler->ref_cnt = 1;
1445    memcpy(&sampler->key, &binding->key, sizeof(binding->key));
1446 
1447    sampler->border_color_state =
1448       anv_state_pool_alloc(&device->dynamic_state_pool,
1449                            sizeof(struct gfx8_border_color), 64);
1450    memcpy(sampler->border_color_state.map,
1451           binding->key.color,
1452           sizeof(binding->key.color));
1453 
1454    sampler->sampler_state =
1455       anv_state_pool_alloc(&device->dynamic_state_pool,
1456                            ANV_SAMPLER_STATE_SIZE, 32);
1457 
1458    struct GENX(SAMPLER_STATE) sampler_state = {
1459       .BorderColorPointer = sampler->border_color_state.offset,
1460    };
1461    uint32_t dwords[GENX(SAMPLER_STATE_length)];
1462    GENX(SAMPLER_STATE_pack)(NULL, dwords, &sampler_state);
1463 
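        /* The embedded sampler key already holds the packed SAMPLER_STATE
         * dwords minus the border color pointer, so OR the freshly packed
         * pointer into them to form the final state.
         */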
1464    for (uint32_t i = 0; i < GENX(SAMPLER_STATE_length); i++) {
1465       ((uint32_t *)sampler->sampler_state.map)[i] =
1466          dwords[i] | binding->key.sampler[i];
1467    }
1468 }
1469 
1470 /* Wa_14015814527
1471  *
1472  * Check whether a task shader was used within the cmd_buffer; if so,
1473  * commit empty URB states and a null prim.
1474  */
1475 void
1476 genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
1477 {
1478    if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
1479       return;
1480 
1481 #if GFX_VERx10 >= 125
1482    const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;
1483 
1484    if (!intel_needs_workaround(devinfo, 16014390852))
1485       return;
1486 
1487    if (cmd_buffer->state.current_pipeline != _3D ||
1488        !cmd_buffer->state.gfx.used_task_shader)
1489       return;
1490 
1491    cmd_buffer->state.gfx.used_task_shader = false;
1492 
1493    /* Wa_14015821291 mentions that the WA below is not required if a
1494     * pipeline flush is already pending. The state will get flushed by
1495     * cmd_buffer_flush_state before the draw.
1496     */
1497    if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
1498       return;
1499 
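        /* Emit zero-sized URB allocations for the VS..GS stages. The
         * per-stage URB commands have consecutive sub-opcodes, so offsetting
         * the VS command by the stage index selects each stage in turn.
         */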
1500    for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
1501 #if GFX_VER >= 12
1502       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
1503          urb._3DCommandSubOpcode += i;
1504       }
1505 #else
1506       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
1507          urb._3DCommandSubOpcode += i;
1508       }
1509 #endif
1510    }
1511 
1512    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
1513    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
1514 
1515    /* Issue 'nullprim' to commit the state. */
1516    genx_batch_emit_pipe_control_write
1517       (&cmd_buffer->batch, cmd_buffer->device->info,
1518        cmd_buffer->state.current_pipeline,
1519        WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
1520 #endif
1521 }
1522 
1523 VkResult
1524 genX(init_trtt_context_state)(struct anv_async_submit *submit)
1525 {
1526 #if GFX_VER >= 12
1527    struct anv_queue *queue = submit->queue;
1528    struct anv_device *device = queue->device;
1529    struct anv_trtt *trtt = &device->trtt;
1530    struct anv_batch *batch = &submit->batch;
1531 
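        /* The L3 table address is split across the LOW/HIGH register pair:
         * bits 31:12 of the 4KB-aligned address go in the low half, bits
         * 47:32 in the high half.
         */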
1532    assert((trtt->l3_addr & 0xFFF) == 0);
1533    uint32_t l3_addr_low = (trtt->l3_addr & 0xFFFFF000) >> 12;
1534    uint32_t l3_addr_high = (trtt->l3_addr >> 32) & 0xFFFF;
1535 
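        /* The same TR-TT configuration is written three times, presumably
         * one copy per engine: GFX_* for the render engine, BLT_* for the
         * blitter and COMP_CTX0_* for the compute context.
         */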
1536    anv_batch_write_reg(batch, GENX(GFX_TRTT_INVAL), trtt_inval)
1537       trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1538    anv_batch_write_reg(batch, GENX(GFX_TRTT_NULL), trtt_null)
1539       trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1540    anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low)
1541       trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1542    anv_batch_write_reg(batch, GENX(GFX_TRTT_L3_BASE_HIGH), trtt_base_high)
1543       trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1544 
1545    anv_batch_write_reg(batch, GENX(BLT_TRTT_INVAL), trtt_inval)
1546       trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1547    anv_batch_write_reg(batch, GENX(BLT_TRTT_NULL), trtt_null)
1548       trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1549    anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_LOW), trtt_base_low)
1550       trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1551    anv_batch_write_reg(batch, GENX(BLT_TRTT_L3_BASE_HIGH), trtt_base_high)
1552       trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1553 
1554    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_INVAL), trtt_inval)
1555       trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
1556    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_NULL), trtt_null)
1557       trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
1558    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_LOW), trtt_base_low)
1559       trtt_base_low.TRVAL3PointerLowerAddress = l3_addr_low;
1560    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_L3_BASE_HIGH), trtt_base_high)
1561       trtt_base_high.TRVAL3PointerUpperAddress = l3_addr_high;
1562 
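        /* Gfx20+ programs the VA range from the upper bits of the TR-TT VA
         * base directly; earlier gens appear to identify the range with a
         * mask/data pair instead.
         */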
1563 #if GFX_VER >= 20
1564    uint32_t trva_base = device->physical->va.trtt.addr >> 44;
1565    anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range)
1566       trtt_va_range.TRVABase = trva_base;
1567    anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range)
1568       trtt_va_range.TRVABase = trva_base;
1569    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range)
1570       trtt_va_range.TRVABase = trva_base;
1571 #else
1572    anv_batch_write_reg(batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
1573       trtt_va_range.TRVAMaskValue = 0xF;
1574       trtt_va_range.TRVADataValue = 0xF;
1575    }
1576    anv_batch_write_reg(batch, GENX(BLT_TRTT_VA_RANGE), trtt_va_range) {
1577       trtt_va_range.TRVAMaskValue = 0xF;
1578       trtt_va_range.TRVADataValue = 0xF;
1579    }
1580    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_VA_RANGE), trtt_va_range) {
1581       trtt_va_range.TRVAMaskValue = 0xF;
1582       trtt_va_range.TRVADataValue = 0xF;
1583    }
1584 #endif
1585 
1586    /* Enabling TR-TT needs to be done after setting up the other registers.
1587     */
1588    anv_batch_write_reg(batch, GENX(GFX_TRTT_CR), trtt_cr)
1589       trtt_cr.TRTTEnable = true;
1590    anv_batch_write_reg(batch, GENX(BLT_TRTT_CR), trtt_cr)
1591       trtt_cr.TRTTEnable = true;
1592    anv_batch_write_reg(batch, GENX(COMP_CTX0_TRTT_CR), trtt_cr)
1593       trtt_cr.TRTTEnable = true;
1594 
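        /* Invalidate TLBs so the new translation tables take effect. Copy
         * queues are skipped since they cannot execute PIPE_CONTROL.
         */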
1595    if (queue->family->engine_class != INTEL_ENGINE_CLASS_COPY) {
1596       genx_batch_emit_pipe_control(batch, device->info, _3D,
1597                                    ANV_PIPE_CS_STALL_BIT |
1598                                    ANV_PIPE_TLB_INVALIDATE_BIT);
1599    }
1600 #endif
1601    return VK_SUCCESS;
1602 }
1603