/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "common/intel_aux_map.h"
#include "common/intel_sample_positions.h"
#include "common/intel_pixel_hash.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "vk_standard_sample_locations.h"

#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
#include "grl/genX_grl.h"
#endif

#include "vk_util.h"
#include "vk_format.h"

static void
genX(emit_slice_hashing_state)(struct anv_device *device,
                               struct anv_batch *batch)
{
#if GFX_VER == 11
   /* Gfx11 hardware has two pixel pipes at most. */
   for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
      assert(device->info->ppipe_subslices[i] == 0);

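   /* With both pixel pipes equally populated, the default hashing should
    * already balance the load, so no custom table is needed (an inference
    * from the early return below).
    */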
   if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
      return;

   if (!device->slice_hash.alloc_size) {
      unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
      device->slice_hash =
         anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);

      const bool flip = device->info->ppipe_subslices[0] <
                        device->info->ppipe_subslices[1];
      struct GENX(SLICE_HASH_TABLE) table;
      intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);

      GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
   }

   anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = device->slice_hash.offset;
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
   }
#elif GFX_VERx10 == 120
   /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
    * present with n active dual subslices.
    */
   unsigned ppipes_of[3] = {};

   for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
      for (unsigned p = 0; p < 3; p++)
         ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
   }

   /* Gfx12 has three pixel pipes. */
   for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
      assert(device->info->ppipe_subslices[p] == 0);

   if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
      /* All three pixel pipes have the maximum number of active dual
       * subslices, or there is only one active pixel pipe: Nothing to do.
       */
      return;
   }

   anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
      p.SliceHashControl[0] = TABLE_0;

      if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);

      if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
      else
         unreachable("Illegal fusing.");
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
      p.SubsliceHashingTableEnable = true;
      p.SubsliceHashingTableEnableMask = true;
   }
#elif GFX_VERx10 == 125
   /* Calculate the set of present pixel pipes, and a second set of present
    * pixel pipes with 2 dual subslices enabled; the latter will appear in
    * the hashing table with twice the frequency of pixel pipes that have a
    * single dual subslice present.
    */
   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
   for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
      if (device->info->ppipe_subslices[p] > 0)
         ppipe_mask1 |= (1u << p);
      if (device->info->ppipe_subslices[p] > 1)
         ppipe_mask2 |= (1u << p);
   }
   assert(ppipe_mask1);

   if (!device->slice_hash.alloc_size) {
      unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
      device->slice_hash =
         anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);

      struct GENX(SLICE_HASH_TABLE) table;

      /* Note that the hardware expects an array of 7 tables, one for every
       * possible slice count between 2 and 8, each intended to specify the
       * pixel pipe hashing behavior for that slice count. That doesn't
       * actually work, however, among other reasons because of hardware bugs
       * that cause the GPU to erroneously access the table at the wrong
       * index in some cases, so in practice all 7 tables need to be
       * initialized to the same value.
       */
      for (unsigned i = 0; i < 7; i++)
         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
                                             table.Entry[i][0]);

      GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
   }

   anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = device->slice_hash.offset;
   }

   /* TODO: Figure out FCV support for other platforms
    * Testing indicates that FCV is broken on MTL, but works fine on DG2.
    * Let's disable FCV on MTL for now till we figure out what's wrong.
    *
    * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
    *
    * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
    */
   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                    hashing32x32 : NormalMode);
      mode.CrossSliceHashingModeMask = -1;
      mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
      mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
   }
#endif
}

static void
init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
{
   UNUSED struct anv_device *device = queue->device;

#if GFX_VER >= 11
   /* Starting with GFX version 11, SLM is no longer part of the L3$ config
    * so it never changes throughout the lifetime of the VkDevice.
    */
   const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
   genX(emit_l3_config)(batch, device, cfg);
   device->l3_config = cfg;
#endif

#if GFX_VERx10 == 125
   /* Even though L3 partial write merging is supposed to be enabled
    * by default on Gfx12.5 according to the hardware spec, i915
    * appears to accidentally clear the enables during context
    * initialization, so make sure to enable them here since partial
    * write merging has a large impact on rendering performance.
    */
   anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
      reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
      reg.CompressiblePartialWriteMergeEnable = true;
      reg.CoherentPartialWriteMergeEnable = true;
      reg.CrossTilePartialWriteMergeEnable = true;
   }
#endif

   /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
    * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
    */
#if GFX_VER >= 12

#if GFX_VERx10 >= 125
   /* Wa_14016407139:
    *
    * "On Surface state base address modification, for 3D workloads, SW must
    *  always program PIPE_CONTROL either with CS Stall or PS sync stall. In
    *  both the cases set Render Target Cache Flush Enable".
    */
   genx_batch_emit_pipe_control(batch, device->info,
                                0,
                                ANV_PIPE_CS_STALL_BIT |
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
#endif

   /* GEN:BUG:1607854226:
    *
    *  Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
    *  Fortunately, we always start the context off in 3D mode.
    */
   uint32_t mocs = device->isl_dev.mocs.internal;
   anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateBufferSize  = 0xfffff;
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;
      sba.GeneralStateBufferSizeModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         (struct anv_address) { .offset =
         device->physical->va.internal_surface_state_pool.addr,
      };
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { .offset =
         device->physical->va.dynamic_state_pool.addr,
      };
      sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
                                    device->physical->va.sampler_state_pool.size) / 4096;
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { .offset =
         device->physical->va.instruction_state_pool.addr,
      };
      sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;

#if GFX_VER >= 11
      sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
      sba.BindlessSamplerStateBufferSize = 0;
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
#endif

      if (device->physical->indirect_descriptors) {
         sba.BindlessSurfaceStateBaseAddress =
            (struct anv_address) { .offset =
            device->physical->va.bindless_surface_state_pool.addr,
         };
         sba.BindlessSurfaceStateSize =
            anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
         sba.BindlessSurfaceStateMOCS = mocs;
         sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
      } else {
         /* Bindless Surface State & Bindless Sampler State are aligned to the
          * same heap
          */
         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
            .offset = device->physical->va.internal_surface_state_pool.addr,
         };
         sba.BindlessSurfaceStateSize =
            (device->physical->va.internal_surface_state_pool.size +
             device->physical->va.bindless_surface_state_pool.size) - 1;
         sba.BindlessSurfaceStateMOCS = mocs;
         sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
      }

#if GFX_VERx10 >= 125
      sba.L1CacheControl = L1CC_WB;
#endif
   }
#endif

#if GFX_VERx10 >= 125
   if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
      anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
         /* TODO: This is the timeout after which the bucketed thread
          *       dispatcher will kick off a wave of threads. We go with the
          *       lowest value for now. It could be tweaked on a per
          *       application basis (drirc).
          */
         btd.DispatchTimeoutCounter = _64clocks;
         /* BSpec 43851: "This field must be programmed to 6h i.e. memory
          *               backed buffer must be 128KB."
          */
         btd.PerDSSMemoryBackedBufferSize = 6;
         btd.MemoryBackedBufferBasePointer = (struct anv_address) {
            /* This batch doesn't have a reloc list so we can't use the BO
             * here.  We just use the address directly.
             */
            .offset = device->btd_fifo_bo->offset,
         };
      }
   }
#endif
}

static VkResult
init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
{
   struct anv_device *device = queue->device;
   UNUSED const struct intel_device_info *devinfo = queue->device->info;
   uint32_t cmds[256];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

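   /* Pre-pack a single "store zeros" vertex element into
    * device->empty_vs_input.  Presumably this is used for pipelines or draws
    * with no vertex inputs (an assumption based on the name; only the
    * packing itself happens here).
    */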
   struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
      .Valid = true,
      .Component0Control = VFCOMP_STORE_0,
      .Component1Control = VFCOMP_STORE_0,
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
   };
   GENX(VERTEX_ELEMENT_STATE_pack)(NULL, device->empty_vs_input, &empty_ve);

   genX(emit_pipeline_select)(&batch, _3D, device);

#if GFX_VER == 9
   anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
      cm1.FloatBlendOptimizationEnable = true;
      cm1.FloatBlendOptimizationEnableMask = true;
      cm1.MSCRAWHazardAvoidanceBit = true;
      cm1.MSCRAWHazardAvoidanceBitMask = true;
      cm1.PartialResolveDisableInVC = true;
      cm1.PartialResolveDisableInVCMask = true;
   }
#endif

   anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);

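   /* Program a maximal drawing rectangle once at init time.  Per-draw
    * clipping is expected to come from viewport/scissor state, so (as far as
    * I can tell) this never needs to be reprogrammed afterwards.
    */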
   anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleYMin = 0;
      rect.ClippedDrawingRectangleXMin = 0;
      rect.ClippedDrawingRectangleYMax = UINT16_MAX;
      rect.ClippedDrawingRectangleXMax = UINT16_MAX;
      rect.DrawingRectangleOriginY = 0;
      rect.DrawingRectangleOriginX = 0;
   }

   anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);

   /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
    *
    *   "3DSTATE_RASTER if used must be programmed prior to using this
    *    packet."
    *
    * Emit this before 3DSTATE_WM_HZ_OP below.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_RASTER), rast) {
      rast.APIMode = DX101;
   }

   /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
    *
    *    "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
    *     change the Number of Multisamples. This packet must not be used to
    *     change Number of Multisamples in a rendering sequence."
    *
    * Emit this before 3DSTATE_WM_HZ_OP below.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_MULTISAMPLE), ms);

   /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
    * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
    * Clear." It mentions that the packet overrides GPU state for the clear
    * operation and needs to be reset to 0s to clear the overrides. Depending
    * on the kernel, we may not get a context with the state for this packet
    * zeroed. Do it ourselves just in case. We've observed this to prevent a
    * number of GPU hangs on ICL.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);

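   /* Passing NULL selects the standard sample positions from the Vulkan
    * spec; see genX(emit_sample_pattern) further down in this file.
    */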
   genX(emit_sample_pattern)(&batch, NULL);

#if GFX_VER == 11
   /* Bit 5 "Headerless Message for Pre-emptable Contexts" in the SAMPLER
    * MODE register defaults to 0, which means headerless sampler messages
    * are not allowed for pre-emptable contexts. Set bit 5 to 1 to allow
    * them.
    */
   anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
      sm.HeaderlessMessageforPreemptableContexts = true;
      sm.HeaderlessMessageforPreemptableContextsMask = true;
   }

   /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
    * HALF_SLICE_CHICKEN7 register.
    */
   anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
      hsc7.EnabledTexelOffsetPrecisionFix = true;
      hsc7.EnabledTexelOffsetPrecisionFixMask = true;
   }

   anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
      tcc.L3DataPartialWriteMergingEnable = true;
      tcc.ColorZPartialWriteMergingEnable = true;
      tcc.URBPartialWriteMergingEnable = true;
      tcc.TCDisable = true;
   }
#endif
   genX(emit_slice_hashing_state)(device, &batch);

#if GFX_VER >= 11
   /* The hardware specification recommends disabling repacking for
    * compatibility with the decompression mechanism in the display
    * controller.
    */
   if (device->info->disable_ccs_repack) {
      anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
         cm0.DisableRepackingforCompression = true;
         cm0.DisableRepackingforCompressionMask = true;
      }
   }

   /* An unknown issue is causing VS push constants to become corrupted
    * during object-level preemption. For now, restrict ourselves to
    * command-buffer-level preemption to avoid rendering corruption.
    */
   anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
      cc1.ReplayMode = MidcmdbufferPreemption;
      cc1.ReplayModeMask = true;

#if GFX_VERx10 == 120
      cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
      cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
#endif
   }

#if INTEL_NEEDS_WA_1806527549
   /* Wa_1806527549 says to disable the following HiZ optimization when the
    * depth buffer is D16_UNORM. We've found the WA to help with more depth
    * buffer configurations however, so we always disable it just to be safe.
    */
   anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) {
      reg.HZDepthTestLEGEOptimizationDisable = true;
      reg.HZDepthTestLEGEOptimizationDisableMask = true;
   }
#endif

#if GFX_VER == 12
   anv_batch_write_reg(&batch, GENX(FF_MODE2), reg) {
      /* On Alchemist, the FF_MODE2 docs for the GS timer say:
       *
       *    "The timer value must be set to 224."
       *
       * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
       * and that this is necessary to avoid hanging the HS/DS units.  It
       * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
       *
       * The HS timer docs also have the same quote for Alchemist.  I am
       * unaware of a reason it needs to be set to 224 on Tigerlake, but
       * we do so for consistency if nothing else.
       *
       * For the TDS timer value, the docs say:
       *
       *    "For best performance, a value of 4 should be programmed."
       *
       * i915 also sets it this way on Tigerlake due to workarounds.
       *
       * The default VS timer appears to be 0, so we leave it at that.
       */
      reg.GSTimerValue  = 224;
      reg.HSTimerValue  = 224;
      reg.TDSTimerValue = 4;
      reg.VSTimerValue  = 0;
   }
#endif

#if INTEL_NEEDS_WA_1508744258
   /* Disable RHWO by setting 0x7010[14] by default except during resolve
    * pass.
    *
    * We implement global disabling of the optimization here and we toggle it
    * in anv_image_ccs_op().
    */
   anv_batch_write_reg(&batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
      c1.RCCRHWOOptimizationDisable = true;
      c1.RCCRHWOOptimizationDisableMask = true;
   }
#endif

#if GFX_VERx10 < 125
#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
#else
#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
#endif

   /* Enable the new line drawing algorithm that produces higher quality
    * lines.
    */
   anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
      c3.AALineQualityFix = true;
      c3.AALineQualityFixMask = true;
   }
#endif

#if GFX_VER == 12
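   /* Program the 64-bit aux translation table base address into
    * GFX_AUX_TABLE_BASE_ADDR as two 32-bit LRI writes (low dword first,
    * then the high dword).
    */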
   if (device->info->has_aux_map) {
      uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }
   }
#endif

#if GFX_VERx10 == 125
   anv_batch_write_reg(&batch, GENX(CHICKEN_RASTER_2), reg) {
      reg.TBIMRBatchSizeOverride = true;
      reg.TBIMROpenBatchEnable = true;
      reg.TBIMRFastClip = true;
      reg.TBIMRBatchSizeOverrideMask = true;
      reg.TBIMROpenBatchEnableMask = true;
      reg.TBIMRFastClipMask = true;
   }
#endif

   /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
    * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
    *
    * This is only safe on kernels with context isolation support.
    */
   assert(device->physical->info.has_context_isolation);
   anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
      csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
      csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }

   init_common_queue_state(queue, &batch);

   /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
    * the dynamic state base address we need to emit this instruction after
    * STATE_BASE_ADDRESS in init_common_queue_state().
    */
#if GFX_VER == 11
   anv_batch_emit(&batch, GENX(3DSTATE_CPS), cps);
#elif GFX_VER >= 12
   anv_batch_emit(&batch, GENX(3DSTATE_CPS_POINTERS), cps) {
      assert(device->cps_states.alloc_size != 0);
      /* Offset 0 is the disabled state */
      cps.CoarsePixelShadingStateArrayPointer =
         device->cps_states.offset;
   }
#endif

#if GFX_VERx10 >= 125
   anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), zero);
   anv_batch_emit(&batch, GENX(3DSTATE_MESH_CONTROL), zero);
   anv_batch_emit(&batch, GENX(3DSTATE_TASK_CONTROL), zero);

   /* We are no longer required to explicitly flush or invalidate caches
    * since PIPELINE_SELECT is deprecated on Xe2+.
    */
#if GFX_VER < 20
   genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
                                      ANV_NULL_ADDRESS,
                                      0,
                                      ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
#endif

   genX(emit_pipeline_select)(&batch, GPGPU, device);
   anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;
   }

   /* We are no longer required to explicitly flush or invalidate caches
    * since PIPELINE_SELECT is deprecated on Xe2+.
    */
#if GFX_VER < 20
   genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
                                      ANV_NULL_ADDRESS,
                                      0,
                                      ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
#endif

   genX(emit_pipeline_select)(&batch, _3D, device);
#endif

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);

   assert(batch.next <= batch.end);

   if (!device->trtt.queue)
      device->trtt.queue = queue;

   return anv_queue_submit_simple_batch(queue, &batch, is_companion_rcs_batch);
}

static VkResult
init_compute_queue_state(struct anv_queue *queue)
{
   UNUSED const struct intel_device_info *devinfo = queue->device->info;
   uint32_t cmds[64];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

   genX(emit_pipeline_select)(&batch, GPGPU, queue->device);

#if GFX_VER == 12
   if (queue->device->info->has_aux_map) {
      uint64_t aux_base_addr =
         intel_aux_map_get_base(queue->device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }
   }
#else
   assert(!queue->device->info->has_aux_map);
#endif

   /* Wa_14015782607 - Issue pipe control with HDC_flush and
    * untyped cache flush set to 1 when CCS has NP state update with
    * STATE_COMPUTE_MODE.
    */
   if (intel_needs_workaround(devinfo, 14015782607) &&
       queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&batch, devinfo, GPGPU,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }

#if GFX_VERx10 >= 125
   /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(devinfo) &&
       queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control
         (&batch, devinfo, GPGPU,
          ANV_PIPE_CS_STALL_BIT |
          ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
          ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
          ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
          ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
          ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
          ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }

   anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
      cm.PixelAsyncComputeThreadLimit = 4;
      cm.PixelAsyncComputeThreadLimitMask = 0x7;
   }
#endif

   init_common_queue_state(queue, &batch);

#if GFX_VERx10 >= 125
   anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;
   }
#endif

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);

   assert(batch.next <= batch.end);

   return anv_queue_submit_simple_batch(queue, &batch,
                                        false /* is_companion_rcs_batch */);
}

static VkResult
init_copy_video_queue_state(struct anv_queue *queue)
{
#if GFX_VER >= 12
   UNUSED const struct intel_device_info *devinfo = queue->device->info;
   uint32_t cmds[64];
   UNUSED struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

   if (queue->device->info->has_aux_map) {
      uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);

      if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
#if GFX_VERx10 >= 125
         reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
#endif
      }

      uint64_t aux_base_addr =
         intel_aux_map_get_base(queue->device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = reg;
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = reg + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }

      anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      assert(batch.next <= batch.end);

      return anv_queue_submit_simple_batch(queue, &batch,
                                           false /* is_companion_rcs_batch */);
   }
#else
   assert(!queue->device->info->has_aux_map);
#endif

   return VK_SUCCESS;
}

void
genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
{
   assert(pdevice->info.verx10 == GFX_VERx10);
#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
   genX(grl_load_rt_uuid)(pdevice->rt_uuid);
   pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
#endif

   pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
}

VkResult
genX(init_device_state)(struct anv_device *device)
{
   VkResult res;

   device->slice_hash = (struct anv_state) { 0 };
   for (uint32_t i = 0; i < device->queue_count; i++) {
      struct anv_queue *queue = &device->queues[i];
      switch (queue->family->engine_class) {
      case INTEL_ENGINE_CLASS_RENDER:
         res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
         break;
      case INTEL_ENGINE_CLASS_COMPUTE: {
         res = init_compute_queue_state(queue);
         if (res != VK_SUCCESS)
            return res;

         /* Execute the RCS init batch by default on the companion RCS command
          * buffer in order to support MSAA copy/clear operations on the
          * compute queue.
          */
         res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
         break;
      }
      case INTEL_ENGINE_CLASS_VIDEO:
         res = init_copy_video_queue_state(queue);
         break;
      case INTEL_ENGINE_CLASS_COPY:
         res = init_copy_video_queue_state(queue);
         if (res != VK_SUCCESS)
            return res;

         /* Execute the RCS init batch by default on the companion RCS command
          * buffer in order to support MSAA copy/clear operations on the
          * copy queue.
          */
         res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
         break;
      default:
         res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
         break;
      }
      if (res != VK_SUCCESS)
         return res;
   }

   return res;
}

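/* On Gfx12.5+ the macro below iterates over every fragment shading rate
 * combiner op; on Gfx12.0 it expands to nothing, so the block it guards runs
 * exactly once and the combiner fields are simply never programmed.
 */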
#if GFX_VERx10 >= 125
#define maybe_for_each_shading_rate_op(name) \
   for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
        name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
        name++)
#elif GFX_VER >= 12
#define maybe_for_each_shading_rate_op(name)
#endif

/* Rather than re-emitting the CPS_STATE structure every time it changes, and
 * for as many viewports as needed, we can prepare all possible cases up front
 * and just pick the right offset from the prepacked states when needed.
 */
void
genX(init_cps_device_state)(struct anv_device *device)
{
#if GFX_VER >= 12
   void *cps_state_ptr = device->cps_states.map;

   /* Disabled CPS mode */
   for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
      /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
       *
       *   "It is an INVALID configuration to set the CPS mode other than
       *    CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
       *    Such configuration should be disallowed at the API level, and
       *    rendering results are undefined."
       *
       * Since we select this state when coarse pixel shading is disabled,
       * which includes the case where per-sample dispatch is enabled, we
       * need to ensure this is set to NONE.
       */
      struct GENX(CPS_STATE) cps_state = {
         .CoarsePixelShadingMode = CPS_MODE_NONE,
      };

      GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
      cps_state_ptr += GENX(CPS_STATE_length) * 4;
   }

   maybe_for_each_shading_rate_op(op0) {
      maybe_for_each_shading_rate_op(op1) {
         for (uint32_t x = 1; x <= 4; x *= 2) {
            for (uint32_t y = 1; y <= 4; y *= 2) {
               struct GENX(CPS_STATE) cps_state = {
                  .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
                  .MinCPSizeX = x,
                  .MinCPSizeY = y,
               };

#if GFX_VERx10 >= 125
               static const uint32_t combiner_ops[] = {
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR]    = PASSTHROUGH,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR]     = HIGH_QUALITY,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR]     = LOW_QUALITY,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR]     = RELATIVE,
               };

               cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
               cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
#endif /* GFX_VERx10 >= 125 */

               for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
                  GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
                  cps_state_ptr += GENX(CPS_STATE_length) * 4;
               }
            }
         }
      }
   }
#endif /* GFX_VER >= 12 */
}

void
genX(emit_l3_config)(struct anv_batch *batch,
                     const struct anv_device *device,
                     const struct intel_l3_config *cfg)
{
#if GFX_VER < 20
   UNUSED const struct intel_device_info *devinfo = device->info;

#if GFX_VER >= 12
#define L3_ALLOCATION_REG GENX(L3ALLOC)
#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
#else
#define L3_ALLOCATION_REG GENX(L3CNTLREG)
#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
#endif

   anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
      if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
         assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
                          cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
                          cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
                          cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
#if GFX_VER >= 12
         l3cr.L3FullWayAllocationEnable = true;
#else
         unreachable("Invalid L3$ config");
#endif
      } else {
#if GFX_VER < 11
         l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
#endif
#if INTEL_NEEDS_WA_1406697149
         /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
          * set in L3CNTLREG register. The default setting of the bit is not
          * the desirable behavior.
          */
         l3cr.ErrorDetectionBehaviorControl = true;
         l3cr.UseFullWays = true;
#endif /* INTEL_NEEDS_WA_1406697149 */
         assert(cfg->n[INTEL_L3P_IS] == 0);
         assert(cfg->n[INTEL_L3P_C] == 0);
         assert(cfg->n[INTEL_L3P_T] == 0);
         l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
         l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
         l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
         l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
      }
   }
#endif /* GFX_VER < 20 */
}

void
genX(emit_sample_pattern)(struct anv_batch *batch,
                          const struct vk_sample_locations_state *sl)
{
   assert(sl == NULL || sl->grid_size.width == 1);
   assert(sl == NULL || sl->grid_size.height == 1);

   /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
    * VkPhysicalDeviceFeatures::standardSampleLocations.
    */
   anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
      /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
       *
       *    "When programming the sample offsets (for NUMSAMPLES_4 or _8
       *    and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
       *    (or 7 for 8X, or 15 for 16X) must have monotonically increasing
       *    distance from the pixel center. This is required to get the
       *    correct centroid computation in the device."
       *
       * However, the Vulkan spec seems to require that the samples occur
       * in the order provided through the API. The standard sample patterns
       * have the above property that they have monotonically increasing
       * distances from the center but client-provided ones do not. As long as
       * this only affects centroid calculations as the docs say, we should be
       * ok because OpenGL and Vulkan only require that the centroid be some
       * lit sample and that it's the same for all samples in a pixel; they
       * have no requirement that it be the one closest to center.
       */
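      /* The VK_SAMPLE_COUNT_*_BIT enum values are numerically equal to the
       * sample counts themselves (1, 2, 4, 8, 16), so walking the powers of
       * two covers every supported count.
       */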
      for (uint32_t i = 1; i <= 16; i *= 2) {
         switch (i) {
         case VK_SAMPLE_COUNT_1_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_1X(sp._1xSample);
            }
            break;
         case VK_SAMPLE_COUNT_2_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_2X(sp._2xSample);
            }
            break;
         case VK_SAMPLE_COUNT_4_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_4X(sp._4xSample);
            }
            break;
         case VK_SAMPLE_COUNT_8_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_8X(sp._8xSample);
            }
            break;
         case VK_SAMPLE_COUNT_16_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_16X(sp._16xSample);
            }
            break;
         default:
            unreachable("Invalid sample count");
         }
      }
   }
}

static uint32_t
vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
{
   switch (filter) {
   default:
      unreachable("Invalid filter");
   case VK_FILTER_NEAREST:
      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
   case VK_FILTER_LINEAR:
      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
   }
}

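/* The hardware encodes the maximum anisotropy ratio in steps of two starting
 * at 2:1 (0 = 2:1, 1 = 4:1, ..., 7 = 16:1), hence the clamp and rescale
 * below.
 */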
static uint32_t
vk_to_intel_max_anisotropy(float ratio)
{
   return (CLAMP(ratio, 2, 16) - 2) / 2;
}

static const uint32_t vk_to_intel_mipmap_mode[] = {
   [VK_SAMPLER_MIPMAP_MODE_NEAREST]          = MIPFILTER_NEAREST,
   [VK_SAMPLER_MIPMAP_MODE_LINEAR]           = MIPFILTER_LINEAR
};

static const uint32_t vk_to_intel_tex_address[] = {
   [VK_SAMPLER_ADDRESS_MODE_REPEAT]          = TCM_WRAP,
   [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE]   = TCM_CLAMP,
   [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
};

/* Vulkan specifies the result of shadow comparisons as:
 *     1     if   ref <op> texel,
 *     0     otherwise.
 *
 * The hardware does:
 *     0     if texel <op> ref,
 *     1     otherwise.
 *
 * So, these look a bit strange because there's both a negation
 * and swapping of the arguments involved.
 */
static const uint32_t vk_to_intel_shadow_compare_op[] = {
   [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_ALWAYS,
   [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LESS,
   [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_NEVER,
};

static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
   [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
   [VK_SAMPLER_REDUCTION_MODE_MIN]              = MINIMUM,
   [VK_SAMPLER_REDUCTION_MODE_MAX]              = MAXIMUM,
};

VkResult genX(CreateSampler)(
    VkDevice                                    _device,
    const VkSamplerCreateInfo*                  pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSampler*                                  pSampler)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_sampler *sampler;

   sampler = vk_sampler_create(&device->vk, pCreateInfo,
                               pAllocator, sizeof(*sampler));
   if (!sampler)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   const struct vk_format_ycbcr_info *ycbcr_info =
      sampler->vk.format != VK_FORMAT_UNDEFINED ?
      vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
   assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));

   sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;

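   /* Standard border colors are pre-baked in device->border_colors at a
    * fixed 64-byte stride, indexed by the VkBorderColor enum value; custom
    * border colors get a slot from the reserved pool below.
    */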
   uint32_t border_color_stride = 64;
   uint32_t border_color_offset;
   if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
      border_color_offset = device->border_colors.offset +
                            pCreateInfo->borderColor *
                            border_color_stride;
   } else {
      assert(vk_border_color_is_custom(sampler->vk.border_color));
      sampler->custom_border_color =
         anv_state_reserved_pool_alloc(&device->custom_border_colors);
      border_color_offset = sampler->custom_border_color.offset;

      union isl_color_value color = { .u32 = {
         sampler->vk.border_color_value.uint32[0],
         sampler->vk.border_color_value.uint32[1],
         sampler->vk.border_color_value.uint32[2],
         sampler->vk.border_color_value.uint32[3],
      } };

      const struct anv_format *format_desc =
         sampler->vk.format != VK_FORMAT_UNDEFINED ?
         anv_get_format(sampler->vk.format) : NULL;

      if (format_desc && format_desc->n_planes == 1 &&
          !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
         const struct anv_format_plane *fmt_plane = &format_desc->planes[0];

         assert(!isl_format_has_int_channel(fmt_plane->isl_format));
         color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
      }

      memcpy(sampler->custom_border_color.map, color.u32, sizeof(color));
   }

   /* If we have bindless, allocate enough samplers.  We allocate 32 bytes
    * for each sampler instead of 16 bytes because we want all bindless
    * samplers to be 32-byte aligned so we don't have to use indirect
    * sampler messages on them.
    */
   sampler->bindless_state =
      anv_state_pool_alloc(&device->dynamic_state_pool,
                           sampler->n_planes * 32, 32);

   const bool seamless_cube =
      !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);

   for (unsigned p = 0; p < sampler->n_planes; p++) {
      const bool plane_has_chroma =
         ycbcr_info && ycbcr_info->planes[p].has_chroma;
      const VkFilter min_filter =
         plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
                            pCreateInfo->minFilter;
      const VkFilter mag_filter =
         plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
                            pCreateInfo->magFilter;
      const bool force_addr_rounding =
            device->physical->instance->force_filter_addr_rounding;
      const bool enable_min_filter_addr_rounding =
            force_addr_rounding || min_filter != VK_FILTER_NEAREST;
      const bool enable_mag_filter_addr_rounding =
            force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
      /* From Broadwell PRM, SAMPLER_STATE:
       *   "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
       */
      enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
         anv_get_format(sampler->vk.format)->planes[0].isl_format :
         ISL_FORMAT_UNSUPPORTED;
      const bool isl_format_is_planar_yuv =
         plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
         isl_format_is_yuv(plane0_isl_format) &&
         isl_format_is_planar(plane0_isl_format);

      const uint32_t mip_filter_mode =
         isl_format_is_planar_yuv ?
         MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];

      struct GENX(SAMPLER_STATE) sampler_state = {
         .SamplerDisable = false,
         .TextureBorderColorMode = DX10OGL,

#if GFX_VER >= 11
         .CPSLODCompensationEnable = true,
#endif

         .LODPreClampMode = CLAMP_MODE_OGL,

         .MipModeFilter = mip_filter_mode,
         .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
         .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
         .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
         .AnisotropicAlgorithm =
            pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
         .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
         .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
         .ChromaKeyEnable = 0,
         .ChromaKeyIndex = 0,
         .ChromaKeyMode = 0,
         .ShadowFunction =
            vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
                                          pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
         .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,

         .BorderColorPointer = border_color_offset,

         .LODClampMagnificationMode = MIPNONE,

         .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
         .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .TrilinearFilterQuality = 0,
         .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
         .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
         .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
         .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],

         .ReductionType =
            vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
         .ReductionTypeEnable =
            sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
      };

      GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);

      if (sampler->bindless_state.map) {
         memcpy(sampler->bindless_state.map + p * 32,
                sampler->state[p], GENX(SAMPLER_STATE_length) * 4);
      }
   }

   *pSampler = anv_sampler_to_handle(sampler);

   return VK_SUCCESS;
}

/* Wa_14015814527
 *
 * Check if a task shader was used within the command buffer; if so, commit
 * empty URB states and a null primitive.
 */
void
genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
{
   if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
      return;

#if GFX_VERx10 >= 125
   const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;

   if (!intel_needs_workaround(devinfo, 16014390852))
      return;

   if (cmd_buffer->state.current_pipeline != _3D ||
       !cmd_buffer->state.gfx.used_task_shader)
      return;

   cmd_buffer->state.gfx.used_task_shader = false;

   /* Wa_14015821291 mentions that the WA below is not required if we have
    * a pipeline flush going on. It will get flushed during
    * cmd_buffer_flush_state before draw.
    */
   if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
      return;

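   /* 3DSTATE_URB_VS/HS/DS/GS use consecutive command sub-opcodes, so bumping
    * the sub-opcode emits a zeroed URB allocation for each geometry stage in
    * turn.
    */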
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);

   /* Issue 'nullprim' to commit the state. */
   genx_batch_emit_pipe_control_write
      (&cmd_buffer->batch, cmd_buffer->device->info,
       cmd_buffer->state.current_pipeline,
       WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
#endif
}

VkResult
genX(init_trtt_context_state)(struct anv_queue *queue)
{
#if GFX_VER >= 12
   struct anv_device *device = queue->device;
   struct anv_trtt *trtt = &device->trtt;

   uint32_t cmds[128];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *)cmds + sizeof(cmds),
   };

   anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) {
      trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
   }
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) {
      trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
   }
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
      trtt_va_range.TRVAMaskValue = 0xF;
      trtt_va_range.TRVADataValue = 0xF;
   }

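   /* The TR-TT L3 table base address must be 4KiB aligned; it is split
    * across the low/high register pair below.
    */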
   uint64_t l3_addr = trtt->l3_addr;
   assert((l3_addr & 0xFFF) == 0);
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) {
      trtt_base_low.TRVAL3PointerLowerAddress =
         (l3_addr & 0xFFFFF000) >> 12;
   }
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH),
         trtt_base_high) {
      trtt_base_high.TRVAL3PointerUpperAddress =
         (l3_addr >> 32) & 0xFFFF;
   }
   /* Enabling TR-TT needs to be done after setting up the other registers.
    */
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) {
      trtt_cr.TRTTEnable = true;
   }

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
   assert(batch.next <= batch.end);

   VkResult res = anv_queue_submit_simple_batch(queue, &batch, false);
   if (res != VK_SUCCESS)
      return res;

#endif
   return VK_SUCCESS;
}