/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "common/intel_aux_map.h"
#include "common/intel_sample_positions.h"
#include "common/intel_pixel_hash.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "vk_standard_sample_locations.h"

#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
#include "grl/genX_grl.h"
#endif

#include "vk_util.h"
#include "vk_format.h"
static void
genX(emit_slice_hashing_state)(struct anv_device *device,
                               struct anv_batch *batch)
{
#if GFX_VER == 11
   /* Gfx11 hardware has two pixel pipes at most. */
   for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
      assert(device->info->ppipe_subslices[i] == 0);

   if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
      return;

   if (!device->slice_hash.alloc_size) {
      unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
      device->slice_hash =
         anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);

      const bool flip = device->info->ppipe_subslices[0] <
                        device->info->ppipe_subslices[1];
      struct GENX(SLICE_HASH_TABLE) table;
      intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);

      GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
   }

   anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = device->slice_hash.offset;
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
   }
#elif GFX_VERx10 == 120
   /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
    * present with n active dual subslices.
    */
   unsigned ppipes_of[3] = {};

   for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
      for (unsigned p = 0; p < 3; p++)
         ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
   }

   /* Gfx12 has three pixel pipes. */
   for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
      assert(device->info->ppipe_subslices[p] == 0);

   if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
      /* All three pixel pipes have the maximum number of active dual
       * subslices, or there is only one active pixel pipe: Nothing to do.
       */
      return;
   }

   anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
      p.SliceHashControl[0] = TABLE_0;

      if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);

      if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
      else
         unreachable("Illegal fusing.");
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
      p.SubsliceHashingTableEnable = true;
      p.SubsliceHashingTableEnableMask = true;
   }
#elif GFX_VERx10 == 125
   /* Calculate the set of present pixel pipes, and another set of
    * present pixel pipes with 2 dual subslices enabled; the latter
    * will appear in the hashing table with twice the frequency of
    * pixel pipes with a single dual subslice present.
    */
   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
   for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
      if (device->info->ppipe_subslices[p] > 0)
         ppipe_mask1 |= (1u << p);
      if (device->info->ppipe_subslices[p] > 1)
         ppipe_mask2 |= (1u << p);
   }
   assert(ppipe_mask1);

   if (!device->slice_hash.alloc_size) {
      unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
      device->slice_hash =
         anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);

      struct GENX(SLICE_HASH_TABLE) table;

      /* Note that the hardware expects an array with 7 tables, each
       * table is intended to specify the pixel pipe hashing behavior
       * for every possible slice count between 2 and 8, however that
       * doesn't actually work, among other reasons due to hardware
       * bugs that will cause the GPU to erroneously access the table
       * at the wrong index in some cases, so in practice all 7 tables
       * need to be initialized to the same value.
       */
      for (unsigned i = 0; i < 7; i++)
         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
                                             table.Entry[i][0]);

      GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
   }

   anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = device->slice_hash.offset;
   }

   /* TODO: Figure out FCV support for other platforms
    * Testing indicates that FCV is broken on MTL, but works fine on DG2.
    * Let's disable FCV on MTL for now till we figure out what's wrong.
    *
    * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
    *
    * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
    */
   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                    hashing32x32 : NormalMode);
      mode.CrossSliceHashingModeMask = -1;
      mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
      mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
   }
#endif
}

static void
init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
{
   UNUSED struct anv_device *device = queue->device;

#if GFX_VER >= 11
   /* Starting with GFX version 11, SLM is no longer part of the L3$ config
    * so it never changes throughout the lifetime of the VkDevice.
    */
   const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
   genX(emit_l3_config)(batch, device, cfg);
   device->l3_config = cfg;
#endif

#if GFX_VERx10 == 125
   /* Even though L3 partial write merging is supposed to be enabled
    * by default on Gfx12.5 according to the hardware spec, i915
    * appears to accidentally clear the enables during context
    * initialization, so make sure to enable them here since partial
    * write merging has a large impact on rendering performance.
    */
   anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
      reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
      reg.CompressiblePartialWriteMergeEnable = true;
      reg.CoherentPartialWriteMergeEnable = true;
      reg.CrossTilePartialWriteMergeEnable = true;
   }
#endif

   /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
    * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
    */
#if GFX_VER >= 12

#if GFX_VERx10 >= 125
   /* Wa_14016407139:
    *
    * "On Surface state base address modification, for 3D workloads, SW must
    *  always program PIPE_CONTROL either with CS Stall or PS sync stall. In
    *  both the cases set Render Target Cache Flush Enable".
    */
   genx_batch_emit_pipe_control(batch, device->info,
                                0,
                                ANV_PIPE_CS_STALL_BIT |
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
#endif

   /* GEN:BUG:1607854226:
    *
    * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
    * Fortunately, we always start the context off in 3D mode.
    */
   uint32_t mocs = device->isl_dev.mocs.internal;
   anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateBufferSize = 0xfffff;
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;
      sba.GeneralStateBufferSizeModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         (struct anv_address) { .offset =
            device->physical->va.internal_surface_state_pool.addr,
         };
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { .offset =
            device->physical->va.dynamic_state_pool.addr,
         };
      sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
                                    device->physical->va.sampler_state_pool.size) / 4096;
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { .offset =
            device->physical->va.instruction_state_pool.addr,
         };
      sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;

#if GFX_VER >= 11
      sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
      sba.BindlessSamplerStateBufferSize = 0;
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
#endif

      if (device->physical->indirect_descriptors) {
         sba.BindlessSurfaceStateBaseAddress =
            (struct anv_address) { .offset =
               device->physical->va.bindless_surface_state_pool.addr,
            };
         sba.BindlessSurfaceStateSize =
            anv_physical_device_bindless_heap_size(device->physical) / ANV_SURFACE_STATE_SIZE - 1;
         sba.BindlessSurfaceStateMOCS = mocs;
         sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
      } else {
         /* Bindless Surface State & Bindless Sampler State are aligned to the
          * same heap
          */
         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
            .offset = device->physical->va.internal_surface_state_pool.addr,
         };
         sba.BindlessSurfaceStateSize =
            (device->physical->va.internal_surface_state_pool.size +
             device->physical->va.bindless_surface_state_pool.size) - 1;
         sba.BindlessSurfaceStateMOCS = mocs;
         sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
      }

#if GFX_VERx10 >= 125
      sba.L1CacheControl = L1CC_WB;
#endif
   }
#endif

#if GFX_VERx10 >= 125
   if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
      anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
         /* TODO: This is the timeout after which the bucketed thread
          * dispatcher will kick off a wave of threads. We go with the
          * lowest value for now. It could be tweaked on a per-application
          * basis (drirc).
          */
         btd.DispatchTimeoutCounter = _64clocks;
         /* BSpec 43851: "This field must be programmed to 6h i.e. memory
          * backed buffer must be 128KB."
          */
         btd.PerDSSMemoryBackedBufferSize = 6;
         btd.MemoryBackedBufferBasePointer = (struct anv_address) {
            /* This batch doesn't have a reloc list so we can't use the BO
             * here. We just use the address directly.
             */
            .offset = device->btd_fifo_bo->offset,
         };
      }
   }
#endif
}

static VkResult
init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
{
   struct anv_device *device = queue->device;
   UNUSED const struct intel_device_info *devinfo = queue->device->info;
   uint32_t cmds[256];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

   struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
      .Valid = true,
      .Component0Control = VFCOMP_STORE_0,
      .Component1Control = VFCOMP_STORE_0,
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
   };
   GENX(VERTEX_ELEMENT_STATE_pack)(NULL, device->empty_vs_input, &empty_ve);

   genX(emit_pipeline_select)(&batch, _3D, device);

#if GFX_VER == 9
   anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
      cm1.FloatBlendOptimizationEnable = true;
      cm1.FloatBlendOptimizationEnableMask = true;
      cm1.MSCRAWHazardAvoidanceBit = true;
      cm1.MSCRAWHazardAvoidanceBitMask = true;
      cm1.PartialResolveDisableInVC = true;
      cm1.PartialResolveDisableInVCMask = true;
   }
#endif

   anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);

   anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleYMin = 0;
      rect.ClippedDrawingRectangleXMin = 0;
      rect.ClippedDrawingRectangleYMax = UINT16_MAX;
      rect.ClippedDrawingRectangleXMax = UINT16_MAX;
      rect.DrawingRectangleOriginY = 0;
      rect.DrawingRectangleOriginX = 0;
   }

   anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);

   /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
    *
    *   "3DSTATE_RASTER if used must be programmed prior to using this
    *    packet."
    *
    * Emit this before 3DSTATE_WM_HZ_OP below.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_RASTER), rast) {
      rast.APIMode = DX101;
   }

   /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
    *
    *   "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
    *    change the Number of Multisamples. This packet must not be used to
    *    change Number of Multisamples in a rendering sequence."
    *
    * Emit this before 3DSTATE_WM_HZ_OP below.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_MULTISAMPLE), ms);

   /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
    * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
    * Clear." It mentions that the packet overrides GPU state for the clear
    * operation and needs to be reset to 0s to clear the overrides. Depending
    * on the kernel, we may not get a context with the state for this packet
    * zeroed. Do it ourselves just in case. We've observed this to prevent a
    * number of GPU hangs on ICL.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);

   genX(emit_sample_pattern)(&batch, NULL);

#if GFX_VER == 11
   /* The default behavior of bit 5 "Headerless Message for Pre-emptable
    * Contexts" in the SAMPLER_MODE register is 0, which means headerless
    * sampler messages are not allowed for pre-emptable contexts. Set bit 5
    * to 1 to allow them.
    */
   anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
      sm.HeaderlessMessageforPreemptableContexts = true;
      sm.HeaderlessMessageforPreemptableContextsMask = true;
   }

   /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in the
    * HALF_SLICE_CHICKEN7 register.
    */
   anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
      hsc7.EnabledTexelOffsetPrecisionFix = true;
      hsc7.EnabledTexelOffsetPrecisionFixMask = true;
   }

   anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
      tcc.L3DataPartialWriteMergingEnable = true;
      tcc.ColorZPartialWriteMergingEnable = true;
      tcc.URBPartialWriteMergingEnable = true;
      tcc.TCDisable = true;
   }
#endif
   genX(emit_slice_hashing_state)(device, &batch);

#if GFX_VER >= 11
   /* The hardware specification recommends disabling repacking for
    * compatibility with the decompression mechanism in the display
    * controller.
    */
   if (device->info->disable_ccs_repack) {
      anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
         cm0.DisableRepackingforCompression = true;
         cm0.DisableRepackingforCompressionMask = true;
      }
   }

   /* An unknown issue is causing VS push constants to become
    * corrupted during object-level preemption. For now, restrict
    * to command buffer level preemption to avoid rendering
    * corruption.
    */
   anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
      cc1.ReplayMode = MidcmdbufferPreemption;
      cc1.ReplayModeMask = true;

#if GFX_VERx10 == 120
      cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
      cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
#endif
   }

#if INTEL_NEEDS_WA_1806527549
   /* Wa_1806527549 says to disable the following HiZ optimization when the
    * depth buffer is D16_UNORM. We've found the WA to help with more depth
    * buffer configurations, however, so we always disable it just to be
    * safe.
    */
   anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) {
      reg.HZDepthTestLEGEOptimizationDisable = true;
      reg.HZDepthTestLEGEOptimizationDisableMask = true;
   }
#endif

#if GFX_VER == 12
   anv_batch_write_reg(&batch, GENX(FF_MODE2), reg) {
      /* On Alchemist, the FF_MODE2 docs for the GS timer say:
       *
       *    "The timer value must be set to 224."
       *
       * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
       * and that this is necessary to avoid hanging the HS/DS units. It
       * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
       *
       * The HS timer docs also have the same quote for Alchemist. I am
       * unaware of a reason it needs to be set to 224 on Tigerlake, but
       * we do so for consistency if nothing else.
       *
       * For the TDS timer value, the docs say:
       *
       *    "For best performance, a value of 4 should be programmed."
       *
       * i915 also sets it this way on Tigerlake due to workarounds.
       *
       * The default VS timer appears to be 0, so we leave it at that.
       */
      reg.GSTimerValue = 224;
      reg.HSTimerValue = 224;
      reg.TDSTimerValue = 4;
      reg.VSTimerValue = 0;
   }
#endif

#if INTEL_NEEDS_WA_1508744258
   /* Disable RHWO by setting 0x7010[14] by default except during resolve
    * pass.
    *
    * We implement global disabling of the optimization here and we toggle it
    * in anv_image_ccs_op().
    */
   anv_batch_write_reg(&batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
      c1.RCCRHWOOptimizationDisable = true;
      c1.RCCRHWOOptimizationDisableMask = true;
   }
#endif

#if GFX_VERx10 < 125
#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
#else
#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
#endif

   /* Enable the new line drawing algorithm that produces higher quality
    * lines.
    */
   anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
      c3.AALineQualityFix = true;
      c3.AALineQualityFixMask = true;
   }
#endif

#if GFX_VER == 12
   if (device->info->has_aux_map) {
      uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }
   }
#endif

#if GFX_VERx10 == 125
   anv_batch_write_reg(&batch, GENX(CHICKEN_RASTER_2), reg) {
      reg.TBIMRBatchSizeOverride = true;
      reg.TBIMROpenBatchEnable = true;
      reg.TBIMRFastClip = true;
      reg.TBIMRBatchSizeOverrideMask = true;
      reg.TBIMROpenBatchEnableMask = true;
      reg.TBIMRFastClipMask = true;
   }
#endif

   /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
    * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
    *
    * This is only safe on kernels with context isolation support.
    */
   assert(device->physical->info.has_context_isolation);
   anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
      csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
      csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }

   init_common_queue_state(queue, &batch);

   /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
    * the dynamic state base address we need to emit this instruction after
    * STATE_BASE_ADDRESS in init_common_queue_state().
    */
#if GFX_VER == 11
   anv_batch_emit(&batch, GENX(3DSTATE_CPS), cps);
#elif GFX_VER >= 12
   anv_batch_emit(&batch, GENX(3DSTATE_CPS_POINTERS), cps) {
      assert(device->cps_states.alloc_size != 0);
      /* Offset 0 is the disabled state */
      cps.CoarsePixelShadingStateArrayPointer =
         device->cps_states.offset;
   }
#endif

#if GFX_VERx10 >= 125
   anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), zero);
   anv_batch_emit(&batch, GENX(3DSTATE_MESH_CONTROL), zero);
   anv_batch_emit(&batch, GENX(3DSTATE_TASK_CONTROL), zero);

   /* We are no longer required to explicitly flush or invalidate caches
    * since PIPELINE_SELECT is getting deprecated on Xe2+.
    */
#if GFX_VER < 20
   genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
                                      ANV_NULL_ADDRESS,
                                      0,
                                      ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
#endif

   genX(emit_pipeline_select)(&batch, GPGPU, device);
   anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;
   }

   /* We are no longer required to explicitly flush or invalidate caches
    * since PIPELINE_SELECT is getting deprecated on Xe2+.
    */
#if GFX_VER < 20
   genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
                                      ANV_NULL_ADDRESS,
                                      0,
                                      ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
#endif

   genX(emit_pipeline_select)(&batch, _3D, device);
#endif

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);

   assert(batch.next <= batch.end);

   if (!device->trtt.queue)
      device->trtt.queue = queue;

   return anv_queue_submit_simple_batch(queue, &batch, is_companion_rcs_batch);
}

static VkResult
init_compute_queue_state(struct anv_queue *queue)
{
   UNUSED const struct intel_device_info *devinfo = queue->device->info;
   uint32_t cmds[64];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

   genX(emit_pipeline_select)(&batch, GPGPU, queue->device);

#if GFX_VER == 12
   if (queue->device->info->has_aux_map) {
      uint64_t aux_base_addr =
         intel_aux_map_get_base(queue->device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }
   }
#else
   assert(!queue->device->info->has_aux_map);
#endif

   /* Wa_14015782607 - Issue pipe control with HDC_flush and
    * untyped cache flush set to 1 when CCS has NP state update with
    * STATE_COMPUTE_MODE.
    */
   if (intel_needs_workaround(devinfo, 14015782607) &&
       queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&batch, devinfo, GPGPU,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }

#if GFX_VERx10 >= 125
   /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(devinfo) &&
       queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control
         (&batch, devinfo, GPGPU,
          ANV_PIPE_CS_STALL_BIT |
          ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
          ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
          ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
          ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
          ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
          ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }

   anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
      cm.PixelAsyncComputeThreadLimit = 4;
      cm.PixelAsyncComputeThreadLimitMask = 0x7;
   }
#endif

   init_common_queue_state(queue, &batch);

#if GFX_VERx10 >= 125
   anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;
   }
#endif

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);

   assert(batch.next <= batch.end);

   return anv_queue_submit_simple_batch(queue, &batch,
                                        false /* is_companion_rcs_batch */);
}

static VkResult
init_copy_video_queue_state(struct anv_queue *queue)
{
#if GFX_VER >= 12
   UNUSED const struct intel_device_info *devinfo = queue->device->info;
   uint32_t cmds[64];
   UNUSED struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

   if (queue->device->info->has_aux_map) {
      uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);

      if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
#if GFX_VERx10 >= 125
         reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
#endif
      }

      uint64_t aux_base_addr =
         intel_aux_map_get_base(queue->device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = reg;
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = reg + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }

      anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      assert(batch.next <= batch.end);

      return anv_queue_submit_simple_batch(queue, &batch,
                                           false /* is_companion_rcs_batch */);
   }
#else
   assert(!queue->device->info->has_aux_map);
#endif

   return VK_SUCCESS;
}

void
genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
{
   assert(pdevice->info.verx10 == GFX_VERx10);
#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
   genX(grl_load_rt_uuid)(pdevice->rt_uuid);
   pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
#endif

   pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
}

VkResult
genX(init_device_state)(struct anv_device *device)
{
   VkResult res;

   device->slice_hash = (struct anv_state) { 0 };
   for (uint32_t i = 0; i < device->queue_count; i++) {
      struct anv_queue *queue = &device->queues[i];
      switch (queue->family->engine_class) {
      case INTEL_ENGINE_CLASS_RENDER:
         res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
         break;
      case INTEL_ENGINE_CLASS_COMPUTE: {
         res = init_compute_queue_state(queue);
         if (res != VK_SUCCESS)
            return res;

         /**
          * Execute RCS init batch by default on the companion RCS command buffer in
          * order to support MSAA copy/clear operations on the compute queue.
          */
         res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
         break;
      }
      case INTEL_ENGINE_CLASS_VIDEO:
         res = init_copy_video_queue_state(queue);
         break;
      case INTEL_ENGINE_CLASS_COPY:
         res = init_copy_video_queue_state(queue);
         if (res != VK_SUCCESS)
            return res;

         /**
          * Execute RCS init batch by default on the companion RCS command buffer in
          * order to support MSAA copy/clear operations on the copy queue.
          */
         res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
         break;
      default:
         res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
         break;
      }
      if (res != VK_SUCCESS)
         return res;
   }

   return res;
}

#if GFX_VERx10 >= 125
#define maybe_for_each_shading_rate_op(name) \
   for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
        name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
        name++)
#elif GFX_VER >= 12
#define maybe_for_each_shading_rate_op(name)
#endif

/* Rather than re-emitting the CPS_STATE structure every time the coarse
 * pixel state changes, and for as many viewports as needed, we can prepare
 * all possible cases up front and just pick the right offset from the
 * prepacked states when needed.
 */
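/* Layout of the prepacked buffer, as built below: the first MAX_VIEWPORTS
 * CPS_STATE entries cover the disabled (CPS_MODE_NONE) case, followed by one
 * group of MAX_VIEWPORTS entries for every combination of MinCPSizeX/Y in
 * {1, 2, 4} (and, on Gfx12.5+, every pair of combiner opcodes).
 */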
void
genX(init_cps_device_state)(struct anv_device *device)
{
#if GFX_VER >= 12
   void *cps_state_ptr = device->cps_states.map;

   /* Disabled CPS mode */
   for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
      /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
       *
       *   "It is an INVALID configuration to set the CPS mode other than
       *    CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
       *    Such configuration should be disallowed at the API level, and
       *    rendering results are undefined."
       *
       * Since we select this state when per coarse pixel is disabled and that
       * includes when per-sample dispatch is enabled, we need to ensure this
       * is set to NONE.
       */
      struct GENX(CPS_STATE) cps_state = {
         .CoarsePixelShadingMode = CPS_MODE_NONE,
      };

      GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
      cps_state_ptr += GENX(CPS_STATE_length) * 4;
   }

   maybe_for_each_shading_rate_op(op0) {
      maybe_for_each_shading_rate_op(op1) {
         for (uint32_t x = 1; x <= 4; x *= 2) {
            for (uint32_t y = 1; y <= 4; y *= 2) {
               struct GENX(CPS_STATE) cps_state = {
                  .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
                  .MinCPSizeX = x,
                  .MinCPSizeY = y,
               };

#if GFX_VERx10 >= 125
               static const uint32_t combiner_ops[] = {
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR]    = PASSTHROUGH,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR]     = HIGH_QUALITY,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR]     = LOW_QUALITY,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR]     = RELATIVE,
               };

               cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
               cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
#endif /* GFX_VERx10 >= 125 */

               for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
                  GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
                  cps_state_ptr += GENX(CPS_STATE_length) * 4;
               }
            }
         }
      }
   }
#endif /* GFX_VER >= 12 */
}

void
genX(emit_l3_config)(struct anv_batch *batch,
                     const struct anv_device *device,
                     const struct intel_l3_config *cfg)
{
#if GFX_VER < 20
   UNUSED const struct intel_device_info *devinfo = device->info;

#if GFX_VER >= 12
#define L3_ALLOCATION_REG GENX(L3ALLOC)
#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
#else
#define L3_ALLOCATION_REG GENX(L3CNTLREG)
#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
#endif

   anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
      if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
         assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
                          cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
                          cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
                          cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
#if GFX_VER >= 12
         l3cr.L3FullWayAllocationEnable = true;
#else
         unreachable("Invalid L3$ config");
#endif
      } else {
#if GFX_VER < 11
         l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
#endif
#if INTEL_NEEDS_WA_1406697149
         /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
          * set in L3CNTLREG register. The default setting of the bit is not
          * the desirable behavior.
          */
         l3cr.ErrorDetectionBehaviorControl = true;
         l3cr.UseFullWays = true;
#endif /* INTEL_NEEDS_WA_1406697149 */
         assert(cfg->n[INTEL_L3P_IS] == 0);
         assert(cfg->n[INTEL_L3P_C] == 0);
         assert(cfg->n[INTEL_L3P_T] == 0);
         l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
         l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
         l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
         l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
      }
   }
#endif /* GFX_VER < 20 */
}

void
genX(emit_sample_pattern)(struct anv_batch *batch,
                          const struct vk_sample_locations_state *sl)
{
   assert(sl == NULL || sl->grid_size.width == 1);
   assert(sl == NULL || sl->grid_size.height == 1);

   /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
    * VkPhysicalDeviceFeatures::standardSampleLocations.
    */
   anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
      /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
       *
       *    "When programming the sample offsets (for NUMSAMPLES_4 or _8
       *     and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
       *     (or 7 for 8X, or 15 for 16X) must have monotonically increasing
       *     distance from the pixel center. This is required to get the
       *     correct centroid computation in the device."
       *
       * However, the Vulkan spec seems to require that the samples occur
       * in the order provided through the API. The standard sample patterns
       * have the above property that they have monotonically increasing
       * distances from the center but client-provided ones do not. As long as
       * this only affects centroid calculations as the docs say, we should be
       * ok because OpenGL and Vulkan only require that the centroid be some
       * lit sample and that it's the same for all samples in a pixel; they
       * have no requirement that it be the one closest to center.
       */
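      /* VK_SAMPLE_COUNT_N_BIT is numerically equal to N for these
       * power-of-two counts, so the loop counter below serves both as the
       * sample count and as the VkSampleCountFlagBits value switched on.
       */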
      for (uint32_t i = 1; i <= 16; i *= 2) {
         switch (i) {
         case VK_SAMPLE_COUNT_1_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_1X(sp._1xSample);
            }
            break;
         case VK_SAMPLE_COUNT_2_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_2X(sp._2xSample);
            }
            break;
         case VK_SAMPLE_COUNT_4_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_4X(sp._4xSample);
            }
            break;
         case VK_SAMPLE_COUNT_8_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_8X(sp._8xSample);
            }
            break;
         case VK_SAMPLE_COUNT_16_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_16X(sp._16xSample);
            }
            break;
         default:
            unreachable("Invalid sample count");
         }
      }
   }
}

static uint32_t
vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
{
   switch (filter) {
   default:
      unreachable("Invalid filter");
   case VK_FILTER_NEAREST:
      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
   case VK_FILTER_LINEAR:
      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
   }
}

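/* The MaximumAnisotropy field programmed below is an encoded value rather
 * than the raw ratio: clamping to [2, 16] and computing (ratio - 2) / 2 maps
 * 2:1 to 0 up to 16:1 to 7, i.e. one step per additional 2:1 of anisotropy.
 */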
static uint32_t
vk_to_intel_max_anisotropy(float ratio)
{
   return (CLAMP(ratio, 2, 16) - 2) / 2;
}

static const uint32_t vk_to_intel_mipmap_mode[] = {
   [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
   [VK_SAMPLER_MIPMAP_MODE_LINEAR]  = MIPFILTER_LINEAR
};

static const uint32_t vk_to_intel_tex_address[] = {
   [VK_SAMPLER_ADDRESS_MODE_REPEAT]               = TCM_WRAP,
   [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT]      = TCM_MIRROR,
   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE]        = TCM_CLAMP,
   [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER]      = TCM_CLAMP_BORDER,
};

/* Vulkan specifies the result of shadow comparisons as:
 *     1 if   ref <op> texel,
 *     0 otherwise.
 *
 * The hardware does:
 *     0 if texel <op> ref,
 *     1 otherwise.
 *
 * So, these look a bit strange because there's both a negation
 * and swapping of the arguments involved.
 */
static const uint32_t vk_to_intel_shadow_compare_op[] = {
   [VK_COMPARE_OP_NEVER]            = PREFILTEROP_ALWAYS,
   [VK_COMPARE_OP_LESS]             = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_EQUAL]            = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]    = PREFILTEROP_LESS,
   [VK_COMPARE_OP_GREATER]          = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_NOT_EQUAL]        = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_ALWAYS]           = PREFILTEROP_NEVER,
};

static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
   [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
   [VK_SAMPLER_REDUCTION_MODE_MIN]              = MINIMUM,
   [VK_SAMPLER_REDUCTION_MODE_MAX]              = MAXIMUM,
};

VkResult genX(CreateSampler)(
    VkDevice                                    _device,
    const VkSamplerCreateInfo*                  pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSampler*                                  pSampler)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_sampler *sampler;

   sampler = vk_sampler_create(&device->vk, pCreateInfo,
                               pAllocator, sizeof(*sampler));
   if (!sampler)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   const struct vk_format_ycbcr_info *ycbcr_info =
      sampler->vk.format != VK_FORMAT_UNDEFINED ?
      vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
   assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));

   sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;

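   /* The standard VkBorderColor values (FLOAT_TRANSPARENT_BLACK through
    * INT_OPAQUE_WHITE, enum values 0..5) are assumed to index pre-packed
    * entries in device->border_colors spaced border_color_stride bytes
    * apart, so the enum value is usable directly as an index below.
    */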
   uint32_t border_color_stride = 64;
   uint32_t border_color_offset;
   if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
      border_color_offset = device->border_colors.offset +
                            pCreateInfo->borderColor *
                            border_color_stride;
   } else {
      assert(vk_border_color_is_custom(sampler->vk.border_color));
      sampler->custom_border_color =
         anv_state_reserved_pool_alloc(&device->custom_border_colors);
      border_color_offset = sampler->custom_border_color.offset;

      union isl_color_value color = { .u32 = {
         sampler->vk.border_color_value.uint32[0],
         sampler->vk.border_color_value.uint32[1],
         sampler->vk.border_color_value.uint32[2],
         sampler->vk.border_color_value.uint32[3],
      } };

      const struct anv_format *format_desc =
         sampler->vk.format != VK_FORMAT_UNDEFINED ?
         anv_get_format(sampler->vk.format) : NULL;

      if (format_desc && format_desc->n_planes == 1 &&
          !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
         const struct anv_format_plane *fmt_plane = &format_desc->planes[0];

         assert(!isl_format_has_int_channel(fmt_plane->isl_format));
         color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
      }

      memcpy(sampler->custom_border_color.map, color.u32, sizeof(color));
   }

   /* If we have bindless, allocate enough samplers. We allocate 32 bytes
    * for each sampler instead of 16 bytes because we want all bindless
    * samplers to be 32-byte aligned so we don't have to use indirect
    * sampler messages on them.
    */
   sampler->bindless_state =
      anv_state_pool_alloc(&device->dynamic_state_pool,
                           sampler->n_planes * 32, 32);

   const bool seamless_cube =
      !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);

   for (unsigned p = 0; p < sampler->n_planes; p++) {
      const bool plane_has_chroma =
         ycbcr_info && ycbcr_info->planes[p].has_chroma;
      const VkFilter min_filter =
         plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
                            pCreateInfo->minFilter;
      const VkFilter mag_filter =
         plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
                            pCreateInfo->magFilter;
      const bool force_addr_rounding =
         device->physical->instance->force_filter_addr_rounding;
      const bool enable_min_filter_addr_rounding =
         force_addr_rounding || min_filter != VK_FILTER_NEAREST;
      const bool enable_mag_filter_addr_rounding =
         force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
      /* From Broadwell PRM, SAMPLER_STATE:
       *   "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
       */
      enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
         anv_get_format(sampler->vk.format)->planes[0].isl_format :
         ISL_FORMAT_UNSUPPORTED;
      const bool isl_format_is_planar_yuv =
         plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
         isl_format_is_yuv(plane0_isl_format) &&
         isl_format_is_planar(plane0_isl_format);

      const uint32_t mip_filter_mode =
         isl_format_is_planar_yuv ?
         MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];

      struct GENX(SAMPLER_STATE) sampler_state = {
         .SamplerDisable = false,
         .TextureBorderColorMode = DX10OGL,

#if GFX_VER >= 11
         .CPSLODCompensationEnable = true,
#endif

         .LODPreClampMode = CLAMP_MODE_OGL,

         .MipModeFilter = mip_filter_mode,
         .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
         .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
         .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
         .AnisotropicAlgorithm =
            pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
         .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
         .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
         .ChromaKeyEnable = 0,
         .ChromaKeyIndex = 0,
         .ChromaKeyMode = 0,
         .ShadowFunction =
            vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
                                          pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
         .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,

         .BorderColorPointer = border_color_offset,

         .LODClampMagnificationMode = MIPNONE,

         .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
         .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .TrilinearFilterQuality = 0,
         .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
         .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
         .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
         .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],

         .ReductionType =
            vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
         .ReductionTypeEnable =
            sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
      };

      GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);

      if (sampler->bindless_state.map) {
         memcpy(sampler->bindless_state.map + p * 32,
                sampler->state[p], GENX(SAMPLER_STATE_length) * 4);
      }
   }

   *pSampler = anv_sampler_to_handle(sampler);

   return VK_SUCCESS;
}

/* Wa_14015814527
 *
 * Check if a task shader was utilized within cmd_buffer; if so,
 * commit empty URB states and a null primitive.
 */
void
genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
{
   if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
      return;

#if GFX_VERx10 >= 125
   const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;

   if (!intel_needs_workaround(devinfo, 16014390852))
      return;

   if (cmd_buffer->state.current_pipeline != _3D ||
       !cmd_buffer->state.gfx.used_task_shader)
      return;

   cmd_buffer->state.gfx.used_task_shader = false;

   /* Wa_14015821291 mentions that the WA below is not required if we have
    * a pipeline flush going on. It will get flushed during
    * cmd_buffer_flush_state before draw.
    */
   if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
      return;

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
      }
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);

   /* Issue 'nullprim' to commit the state. */
   genx_batch_emit_pipe_control_write
      (&cmd_buffer->batch, cmd_buffer->device->info,
       cmd_buffer->state.current_pipeline,
       WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
#endif
}

VkResult
genX(init_trtt_context_state)(struct anv_queue *queue)
{
#if GFX_VER >= 12
   struct anv_device *device = queue->device;
   struct anv_trtt *trtt = &device->trtt;

   uint32_t cmds[128];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *)cmds + sizeof(cmds),
   };

   anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) {
      trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
   }
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) {
      trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
   }
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
      trtt_va_range.TRVAMaskValue = 0xF;
      trtt_va_range.TRVADataValue = 0xF;
   }

   uint64_t l3_addr = trtt->l3_addr;
   assert((l3_addr & 0xFFF) == 0);
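   /* The 4KiB-aligned L3 table address is split across two registers below:
    * bits [31:12] go in the LOW register and bits [47:32] in the HIGH one.
    */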
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) {
      trtt_base_low.TRVAL3PointerLowerAddress =
         (l3_addr & 0xFFFFF000) >> 12;
   }
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH),
                       trtt_base_high) {
      trtt_base_high.TRVAL3PointerUpperAddress =
         (l3_addr >> 32) & 0xFFFF;
   }
   /* Enabling TR-TT needs to be done after setting up the other registers.
    */
   anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) {
      trtt_cr.TRTTEnable = true;
   }

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
   assert(batch.next <= batch.end);

   VkResult res = anv_queue_submit_simple_batch(queue, &batch, false);
   if (res != VK_SUCCESS)
      return res;

#endif
   return VK_SUCCESS;
}