/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "common/intel_aux_map.h"
#include "common/intel_sample_positions.h"
#include "common/intel_pixel_hash.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "vk_standard_sample_locations.h"
#include "vk_util.h"

static void
genX(emit_slice_hashing_state)(struct anv_device *device,
                               struct anv_batch *batch)
{
#if GFX_VER == 11
   /* Gfx11 hardware has two pixel pipes at most. */
   for (unsigned i = 2; i < ARRAY_SIZE(device->info.ppipe_subslices); i++)
      assert(device->info.ppipe_subslices[i] == 0);

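   /* If both pixel pipes expose the same number of subslices, the default
    * (symmetric) pixel hashing presumably already balances the load, so no
    * custom table is needed.
    */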
   if (device->info.ppipe_subslices[0] == device->info.ppipe_subslices[1])
      return;

   if (!device->slice_hash.alloc_size) {
      unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
      device->slice_hash =
         anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);

      const bool flip = device->info.ppipe_subslices[0] <
                        device->info.ppipe_subslices[1];
      struct GENX(SLICE_HASH_TABLE) table;
      intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);

      GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
   }

   anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = device->slice_hash.offset;
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
   }
#elif GFX_VERx10 == 120
   /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
    * present with n active dual subslices.
    */
   unsigned ppipes_of[3] = {};

   for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
      for (unsigned p = 0; p < 3; p++)
         ppipes_of[n] += (device->info.ppipe_subslices[p] == n);
   }
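
   /* For example, a fused part with ppipe_subslices = { 2, 2, 1 } yields
    * ppipes_of = { 0, 1, 2 }: no idle pixel pipes, one pipe with a single
    * active dual subslice and two fully enabled pipes.
    */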

   /* Gfx12 has three pixel pipes. */
   for (unsigned p = 3; p < ARRAY_SIZE(device->info.ppipe_subslices); p++)
      assert(device->info.ppipe_subslices[p] == 0);

   if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
      /* All three pixel pipes have the maximum number of active dual
       * subslices, or there is only one active pixel pipe: Nothing to do.
       */
      return;
   }

   anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
      p.SliceHashControl[0] = TABLE_0;

      if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);

      if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
      else
         unreachable("Illegal fusing.");
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
      p.SubsliceHashingTableEnable = true;
      p.SubsliceHashingTableEnableMask = true;
   }
#elif GFX_VERx10 == 125
   uint32_t ppipe_mask = 0;
   for (unsigned p = 0; p < ARRAY_SIZE(device->info.ppipe_subslices); p++) {
      if (device->info.ppipe_subslices[p])
         ppipe_mask |= (1u << p);
   }
   assert(ppipe_mask);

   if (!device->slice_hash.alloc_size) {
      unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
      device->slice_hash =
         anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);

      struct GENX(SLICE_HASH_TABLE) table;

      /* Note that the hardware expects an array of 7 tables, each table
       * intended to specify the pixel pipe hashing behavior for one of the
       * possible slice counts between 2 and 8. That doesn't actually work,
       * though, among other reasons due to hardware bugs that cause the GPU
       * to erroneously access the table at the wrong index in some cases, so
       * in practice all 7 tables need to be initialized to the same value.
       */
      for (unsigned i = 0; i < 7; i++)
         intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]);

      GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
   }

   anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = device->slice_hash.offset;
   }

   anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ?
                                    hashing32x32 : NormalMode);
      mode.CrossSliceHashingModeMask = -1;
   }
#endif
}

static void
init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
{
   UNUSED struct anv_device *device = queue->device;

#if GFX_VER >= 11
   /* Starting with GFX version 11, SLM is no longer part of the L3$ config
    * so it never changes throughout the lifetime of the VkDevice.
    */
   const struct intel_l3_config *cfg = intel_get_default_l3_config(&device->info);
   genX(emit_l3_config)(batch, device, cfg);
   device->l3_config = cfg;
#endif

#if GFX_VERx10 >= 125
   /* GEN:BUG:1607854226:
    *
    * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
    * Fortunately, we always start the context off in 3D mode.
    */
   uint32_t mocs = device->isl_dev.mocs.internal;
   anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.GeneralStateBufferSize = 0xfffff;
      sba.GeneralStateMOCS = mocs;
      sba.GeneralStateBaseAddressModifyEnable = true;
      sba.GeneralStateBufferSizeModifyEnable = true;

      sba.StatelessDataPortAccessMOCS = mocs;

      sba.SurfaceStateBaseAddress =
         (struct anv_address) { .offset = SURFACE_STATE_POOL_MIN_ADDRESS };
      sba.SurfaceStateMOCS = mocs;
      sba.SurfaceStateBaseAddressModifyEnable = true;

      sba.DynamicStateBaseAddress =
         (struct anv_address) { .offset = DYNAMIC_STATE_POOL_MIN_ADDRESS };
      sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
      sba.DynamicStateMOCS = mocs;
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;

      sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.IndirectObjectMOCS = mocs;
      sba.IndirectObjectBaseAddressModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;

      sba.InstructionBaseAddress =
         (struct anv_address) { .offset = INSTRUCTION_STATE_POOL_MIN_ADDRESS };
      sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
      sba.InstructionMOCS = mocs;
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;

      sba.BindlessSurfaceStateBaseAddress =
         (struct anv_address) { .offset = SURFACE_STATE_POOL_MIN_ADDRESS };
      sba.BindlessSurfaceStateSize = (1 << 20) - 1;
      sba.BindlessSurfaceStateMOCS = mocs;
      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;

      sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
      sba.BindlessSamplerStateMOCS = mocs;
      sba.BindlessSamplerStateBaseAddressModifyEnable = true;
      sba.BindlessSamplerStateBufferSize = 0;
   }
#endif
}

static VkResult
init_render_queue_state(struct anv_queue *queue)
{
   struct anv_device *device = queue->device;
   uint32_t cmds[128];
   struct anv_batch batch = {
      .start = cmds,
      .next = cmds,
      .end = (void *) cmds + sizeof(cmds),
   };

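   /* Build a small one-shot batch of initial render-engine state on the
    * stack; it is submitted once below via anv_queue_submit_simple_batch().
    */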
   anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) {
#if GFX_VER >= 9
      ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
      ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
#endif
      ps.PipelineSelection = _3D;
   }

#if GFX_VER == 9
   anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
      cm1.FloatBlendOptimizationEnable = true;
      cm1.FloatBlendOptimizationEnableMask = true;
      cm1.MSCRAWHazardAvoidanceBit = true;
      cm1.MSCRAWHazardAvoidanceBitMask = true;
      cm1.PartialResolveDisableInVC = true;
      cm1.PartialResolveDisableInVCMask = true;
   }
#endif

   anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);

   anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
      rect.ClippedDrawingRectangleYMin = 0;
      rect.ClippedDrawingRectangleXMin = 0;
      rect.ClippedDrawingRectangleYMax = UINT16_MAX;
      rect.ClippedDrawingRectangleXMax = UINT16_MAX;
      rect.DrawingRectangleOriginY = 0;
      rect.DrawingRectangleOriginX = 0;
   }

#if GFX_VER >= 8
   anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);

   genX(emit_sample_pattern)(&batch, NULL);

   /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
    * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
    * Clear." It mentions that the packet overrides GPU state for the clear
    * operation and needs to be reset to 0s to clear the overrides. Depending
    * on the kernel, we may not get a context with the state for this packet
    * zeroed. Do it ourselves just in case. We've observed this to prevent a
    * number of GPU hangs on ICL.
    */
   anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);
#endif

#if GFX_VER == 11
   /* Bit 5 "Headerless Message for Pre-emptable Contexts" of the SAMPLER_MODE
    * register defaults to 0, which means headerless sampler messages are not
    * allowed for pre-emptable contexts. Set bit 5 to 1 to allow them.
    */
   anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
      sm.HeaderlessMessageforPreemptableContexts = true;
      sm.HeaderlessMessageforPreemptableContextsMask = true;
   }

   /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
    * HALF_SLICE_CHICKEN7 register.
    */
   anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
      hsc7.EnabledTexelOffsetPrecisionFix = true;
      hsc7.EnabledTexelOffsetPrecisionFixMask = true;
   }

   anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
      tcc.L3DataPartialWriteMergingEnable = true;
      tcc.ColorZPartialWriteMergingEnable = true;
      tcc.URBPartialWriteMergingEnable = true;
      tcc.TCDisable = true;
   }
#endif
   genX(emit_slice_hashing_state)(device, &batch);

#if GFX_VER >= 11
   /* The hardware specification recommends disabling repacking for
    * compatibility with the decompression mechanism in the display
    * controller.
    */
   if (device->info.disable_ccs_repack) {
      anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
         cm0.DisableRepackingforCompression = true;
         cm0.DisableRepackingforCompressionMask = true;
      }
   }

   /* An unknown issue causes VS push constants to become corrupted during
    * object-level preemption. For now, restrict preemption to the command
    * buffer level to avoid rendering corruption.
    */
   anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
      cc1.ReplayMode = MidcmdbufferPreemption;
      cc1.ReplayModeMask = true;

#if GFX_VERx10 == 120
      cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
      cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
#endif
   }

#if GFX_VERx10 == 120
   /* Wa_1806527549 says to disable the following HiZ optimization when the
    * depth buffer is D16_UNORM. We've found the WA to help with more depth
    * buffer configurations however, so we always disable it just to be safe.
    */
   anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) {
      reg.HZDepthTestLEGEOptimizationDisable = true;
      reg.HZDepthTestLEGEOptimizationDisableMask = true;
   }

   /* Wa_1508744258
    *
    *    Disable RHWO by setting 0x7010[14] by default except during resolve
    *    pass.
    *
    * We implement global disabling of the optimization here and we toggle it
    * in anv_image_ccs_op().
    */
   anv_batch_write_reg(&batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
      c1.RCCRHWOOptimizationDisable = true;
      c1.RCCRHWOOptimizationDisableMask = true;
   }
#endif

#if GFX_VERx10 < 125
#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
#else
#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
#endif

   /* Enable the new line drawing algorithm that produces higher quality
    * lines.
    */
   anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
      c3.AALineQualityFix = true;
      c3.AALineQualityFixMask = true;
   }
#endif

#if GFX_VER == 12
   if (device->info.has_aux_map) {
      uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
      assert(aux_base_addr % (32 * 1024) == 0);
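      /* Program the 64-bit aux-map table base address with two 32-bit LRIs:
       * the low dword first, then the high dword at the next register
       * offset.
       */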
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
         lri.DataDWord = aux_base_addr & 0xffffffff;
      }
      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
         lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
         lri.DataDWord = aux_base_addr >> 32;
      }
   }
#endif

   /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
    * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
    *
    * This is only safe on kernels with context isolation support.
    */
   if (GFX_VER >= 8 && device->physical->has_context_isolation) {
#if GFX_VER >= 9
      anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
         csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
         csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
      }
#elif GFX_VER == 8
      anv_batch_write_reg(&batch, GENX(INSTPM), instpm) {
         instpm.CONSTANT_BUFFERAddressOffsetDisable = true;
         instpm.CONSTANT_BUFFERAddressOffsetDisableMask = true;
      }
#endif
   }

   init_common_queue_state(queue, &batch);

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);

   assert(batch.next <= batch.end);

   return anv_queue_submit_simple_batch(queue, &batch);
}

static VkResult
init_compute_queue_state(struct anv_queue *queue)
{
   struct anv_batch batch;

   uint32_t cmds[64];
   batch.start = batch.next = cmds;
   batch.end = (void *) cmds + sizeof(cmds);

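   /* Select the GPGPU pipeline. MaskBits controls which option bits the
    * command may modify; the extra 0x10 on Gfx11+ unmasks the
    * MediaSamplerDOPClockGateEnable field set below.
    */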
   anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) {
#if GFX_VER >= 9
      ps.MaskBits = 3;
#endif
#if GFX_VER >= 11
      ps.MaskBits |= 0x10;
      ps.MediaSamplerDOPClockGateEnable = true;
#endif
      ps.PipelineSelection = GPGPU;
   }

   init_common_queue_state(queue, &batch);

   anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);

   assert(batch.next <= batch.end);

   return anv_queue_submit_simple_batch(queue, &batch);
}

void
genX(init_physical_device_state)(ASSERTED struct anv_physical_device *device)
{
   assert(device->info.verx10 == GFX_VERx10);
}

VkResult
genX(init_device_state)(struct anv_device *device)
{
   VkResult res;

   device->slice_hash = (struct anv_state) { 0 };
   for (uint32_t i = 0; i < device->queue_count; i++) {
      struct anv_queue *queue = &device->queues[i];
      switch (queue->family->engine_class) {
      case I915_ENGINE_CLASS_RENDER:
         res = init_render_queue_state(queue);
         break;
      case I915_ENGINE_CLASS_COMPUTE:
         res = init_compute_queue_state(queue);
         break;
      default:
         res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
         break;
      }
      if (res != VK_SUCCESS)
         return res;
   }

   return res;
}

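/* On Gfx12.5+ the fragment shading rate combiner opcodes are part of
 * CPS_STATE, so every (combiner0, combiner1) pair gets its own prepacked
 * variant below; on Gfx12.0 the macro expands to nothing and only the
 * fragment size combinations are prepacked.
 */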
#if GFX_VERx10 >= 125
#define maybe_for_each_shading_rate_op(name) \
   for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
        name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
        name++)
#elif GFX_VER >= 12
#define maybe_for_each_shading_rate_op(name)
#endif

/* Rather than re-emitting the CPS_STATE structure every time the shading rate
 * changes, and for as many viewports as needed, we can just prepare all
 * possible cases and pick the right offset from the prepacked states when
 * needed.
 */
void
genX(init_cps_device_state)(struct anv_device *device)
{
#if GFX_VER >= 12
   void *cps_state_ptr = device->cps_states.map;

   /* Disabled CPS mode */
   for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
      struct GENX(CPS_STATE) cps_state = {
         .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
         .MinCPSizeX = 1,
         .MinCPSizeY = 1,
#if GFX_VERx10 >= 125
         .Combiner0OpcodeforCPsize = PASSTHROUGH,
         .Combiner1OpcodeforCPsize = PASSTHROUGH,
#endif /* GFX_VERx10 >= 125 */
      };

      GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
      cps_state_ptr += GENX(CPS_STATE_length) * 4;
   }

   maybe_for_each_shading_rate_op(op0) {
      maybe_for_each_shading_rate_op(op1) {
         for (uint32_t x = 1; x <= 4; x *= 2) {
            for (uint32_t y = 1; y <= 4; y *= 2) {
               struct GENX(CPS_STATE) cps_state = {
                  .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
                  .MinCPSizeX = x,
                  .MinCPSizeY = y,
               };

#if GFX_VERx10 >= 125
               static const uint32_t combiner_ops[] = {
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR]    = PASSTHROUGH,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR]     = HIGH_QUALITY,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR]     = LOW_QUALITY,
                  [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR]     = RELATIVE,
               };

               cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
               cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
#endif /* GFX_VERx10 >= 125 */

               for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
                  GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
                  cps_state_ptr += GENX(CPS_STATE_length) * 4;
               }
            }
         }
      }
   }
#endif /* GFX_VER >= 12 */
}

#if GFX_VER >= 12
static uint32_t
get_cps_state_offset(struct anv_device *device, bool cps_enabled,
                     const struct vk_fragment_shading_rate_state *fsr)
{
   if (!cps_enabled)
      return device->cps_states.offset;

   uint32_t offset;
   static const uint32_t size_index[] = {
      [1] = 0,
      [2] = 1,
      [4] = 2,
   };

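   /* Compute the linear index of the prepacked CPS_STATE entry matching fsr.
    * The ordering mirrors the loop nesting in genX(init_cps_device_state):
    * combiner op 0, combiner op 1, then the width/height indices from
    * size_index[] on Gfx12.5+, and just width/height on Gfx12.0.
    */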
#if GFX_VERx10 >= 125
   offset =
      1 + /* skip disabled */
      fsr->combiner_ops[0] * 5 * 3 * 3 +
      fsr->combiner_ops[1] * 3 * 3 +
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#else
   offset =
      1 + /* skip disabled */
      size_index[fsr->fragment_size.width] * 3 +
      size_index[fsr->fragment_size.height];
#endif

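   /* Each logical entry above covers MAX_VIEWPORTS packed CPS_STATE
    * structures, so scale the entry index into a byte offset.
    */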
   offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;

   return device->cps_states.offset + offset;
}
#endif /* GFX_VER >= 12 */

void
genX(emit_l3_config)(struct anv_batch *batch,
                     const struct anv_device *device,
                     const struct intel_l3_config *cfg)
{
   UNUSED const struct intel_device_info *devinfo = &device->info;

#if GFX_VER >= 8

#if GFX_VER >= 12
#define L3_ALLOCATION_REG GENX(L3ALLOC)
#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
#else
#define L3_ALLOCATION_REG GENX(L3CNTLREG)
#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
#endif

   anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
      if (cfg == NULL) {
#if GFX_VER >= 12
         l3cr.L3FullWayAllocationEnable = true;
#else
         unreachable("Invalid L3$ config");
#endif
      } else {
#if GFX_VER < 11
         l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
#endif
#if GFX_VER == 11
         /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
          * set in L3CNTLREG register. The default setting of the bit is not
          * the desirable behavior.
          */
         l3cr.ErrorDetectionBehaviorControl = true;
         l3cr.UseFullWays = true;
#endif /* GFX_VER == 11 */
         assert(cfg->n[INTEL_L3P_IS] == 0);
         assert(cfg->n[INTEL_L3P_C] == 0);
         assert(cfg->n[INTEL_L3P_T] == 0);
         l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
         l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
         l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
         l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
      }
   }

#else /* GFX_VER < 8 */

   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
                       cfg->n[INTEL_L3P_ALL];
   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];

   assert(!cfg->n[INTEL_L3P_ALL]);

   /* When enabled, SLM only uses a portion of the L3 on half of the banks;
    * the matching space on the remaining banks has to be allocated to a
    * client (the URB for all validated configurations) set to the
    * lower-bandwidth 2-bank address hashing mode.
    */
   const bool urb_low_bw = cfg->n[INTEL_L3P_SLM] && devinfo->platform != INTEL_PLATFORM_BYT;
   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);

   /* Minimum number of ways that can be allocated to the URB. */
   const unsigned n0_urb = devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0;
   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);

   anv_batch_write_reg(batch, GENX(L3SQCREG1), l3sqc) {
      l3sqc.ConvertDC_UC = !has_dc;
      l3sqc.ConvertIS_UC = !has_is;
      l3sqc.ConvertC_UC = !has_c;
      l3sqc.ConvertT_UC = !has_t;
#if GFX_VERx10 == 75
      l3sqc.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
#else
      l3sqc.L3SQGeneralPriorityCreditInitialization =
         devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
#endif
      l3sqc.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
   }

   anv_batch_write_reg(batch, GENX(L3CNTLREG2), l3cr2) {
      l3cr2.SLMEnable = cfg->n[INTEL_L3P_SLM];
      l3cr2.URBLowBandwidth = urb_low_bw;
      l3cr2.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
#if GFX_VERx10 != 75
      l3cr2.ALLAllocation = cfg->n[INTEL_L3P_ALL];
#endif
      l3cr2.ROAllocation = cfg->n[INTEL_L3P_RO];
      l3cr2.DCAllocation = cfg->n[INTEL_L3P_DC];
   }

   anv_batch_write_reg(batch, GENX(L3CNTLREG3), l3cr3) {
      l3cr3.ISAllocation = cfg->n[INTEL_L3P_IS];
      l3cr3.ISLowBandwidth = 0;
      l3cr3.CAllocation = cfg->n[INTEL_L3P_C];
      l3cr3.CLowBandwidth = 0;
      l3cr3.TAllocation = cfg->n[INTEL_L3P_T];
      l3cr3.TLowBandwidth = 0;
   }

#if GFX_VERx10 == 75
   if (device->physical->cmd_parser_version >= 4) {
      /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
       * them disabled to avoid crashing the system hard.
       */
      anv_batch_write_reg(batch, GENX(SCRATCH1), s1) {
         s1.L3AtomicDisable = !has_dc;
      }
      anv_batch_write_reg(batch, GENX(CHICKEN3), c3) {
         c3.L3AtomicDisableMask = true;
         c3.L3AtomicDisable = !has_dc;
      }
   }
#endif /* GFX_VERx10 == 75 */

#endif /* GFX_VER < 8 */
}

void
genX(emit_multisample)(struct anv_batch *batch, uint32_t samples,
                       const struct vk_sample_locations_state *sl)
{
   if (sl != NULL) {
      assert(sl->per_pixel == samples);
      assert(sl->grid_size.width == 1);
      assert(sl->grid_size.height == 1);
   } else {
      sl = vk_standard_sample_locations_state(samples);
   }

   anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
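      /* NumberofMultisamples is the log2 of the sample count:
       * __builtin_ffs(1) - 1 == 0, ..., __builtin_ffs(16) - 1 == 4.
       */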
      ms.NumberofMultisamples = __builtin_ffs(samples) - 1;

      ms.PixelLocation = CENTER;
#if GFX_VER >= 8
      /* The PRM says that this bit is valid only for DX9:
       *
       *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
       *    should not have any effect by setting or not setting this bit.
       */
      ms.PixelPositionOffsetEnable = false;
#else
      switch (samples) {
      case 1:
         INTEL_SAMPLE_POS_1X_ARRAY(ms.Sample, sl->locations);
         break;
      case 2:
         INTEL_SAMPLE_POS_2X_ARRAY(ms.Sample, sl->locations);
         break;
      case 4:
         INTEL_SAMPLE_POS_4X_ARRAY(ms.Sample, sl->locations);
         break;
      case 8:
         INTEL_SAMPLE_POS_8X_ARRAY(ms.Sample, sl->locations);
         break;
      default:
         break;
      }
#endif
   }
}

#if GFX_VER >= 8
void
genX(emit_sample_pattern)(struct anv_batch *batch,
                          const struct vk_sample_locations_state *sl)
{
   assert(sl == NULL || sl->grid_size.width == 1);
   assert(sl == NULL || sl->grid_size.height == 1);

   /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
    * VkPhysicalDeviceFeatures::standardSampleLocations.
    */
   anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
      /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
       *
       *    "When programming the sample offsets (for NUMSAMPLES_4 or _8
       *    and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
       *    (or 7 for 8X, or 15 for 16X) must have monotonically increasing
       *    distance from the pixel center. This is required to get the
       *    correct centroid computation in the device."
       *
       * However, the Vulkan spec seems to require that the samples occur
       * in the order provided through the API. The standard sample patterns
       * have the above property that they have monotonically increasing
       * distances from the center but client-provided ones do not. As long as
       * this only affects centroid calculations as the docs say, we should be
       * ok because OpenGL and Vulkan only require that the centroid be some
       * lit sample and that it's the same for all samples in a pixel; they
       * have no requirement that it be the one closest to center.
       */
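      /* The VK_SAMPLE_COUNT_*_BIT flags are numerically equal to the sample
       * counts they represent, so iterating i over powers of two lets us
       * switch directly on the flag values.
       */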
      for (uint32_t i = 1; i <= (GFX_VER >= 9 ? 16 : 8); i *= 2) {
         switch (i) {
         case VK_SAMPLE_COUNT_1_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_1X(sp._1xSample);
            }
            break;
         case VK_SAMPLE_COUNT_2_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_2X(sp._2xSample);
            }
            break;
         case VK_SAMPLE_COUNT_4_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_4X(sp._4xSample);
            }
            break;
         case VK_SAMPLE_COUNT_8_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_8X(sp._8xSample);
            }
            break;
#if GFX_VER >= 9
         case VK_SAMPLE_COUNT_16_BIT:
            if (sl && sl->per_pixel == i) {
               INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
            } else {
               INTEL_SAMPLE_POS_16X(sp._16xSample);
            }
            break;
#endif
         default:
            unreachable("Invalid sample count");
         }
      }
   }
}
#endif

#if GFX_VER >= 11
void
genX(emit_shading_rate)(struct anv_batch *batch,
                        const struct anv_graphics_pipeline *pipeline,
                        const struct vk_fragment_shading_rate_state *fsr)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch;

#if GFX_VER == 11
   anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) {
      cps.CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE;
      if (cps_enable) {
         cps.MinCPSizeX = fsr->fragment_size.width;
         cps.MinCPSizeY = fsr->fragment_size.height;
      }
   }
#elif GFX_VER >= 12
   /* TODO: we can optimize this flush in the following cases:
    *
    *    In the case where the last geometry shader emits a value that is not
    *    constant, we can avoid this stall because we can synchronize the
    *    pixel shader internally with
    *    3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
    *
    *    If we know that the previous pipeline and the current one are using
    *    the same fragment shading rate.
    */
   anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
#if GFX_VERx10 >= 125
      pc.PSSStallSyncEnable = true;
#else
      pc.PSDSyncEnable = true;
#endif
   }

   anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
      struct anv_device *device = pipeline->base.device;

      cps.CoarsePixelShadingStateArrayPointer =
         get_cps_state_offset(device, cps_enable, fsr);
   }
#endif
}
#endif /* GFX_VER >= 11 */

static uint32_t
vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
{
   switch (filter) {
   default:
      assert(!"Invalid filter");
   case VK_FILTER_NEAREST:
      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
   case VK_FILTER_LINEAR:
      return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
   }
}

static uint32_t
vk_to_intel_max_anisotropy(float ratio)
{
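   /* Map the API ratio (clamped to [2, 16]) onto the hardware encoding,
    * where 2:1 -> 0, 4:1 -> 1, ..., 16:1 -> 7.
    */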
   return (anv_clamp_f(ratio, 2, 16) - 2) / 2;
}

static const uint32_t vk_to_intel_mipmap_mode[] = {
   [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
   [VK_SAMPLER_MIPMAP_MODE_LINEAR]  = MIPFILTER_LINEAR
};

static const uint32_t vk_to_intel_tex_address[] = {
   [VK_SAMPLER_ADDRESS_MODE_REPEAT]               = TCM_WRAP,
   [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT]      = TCM_MIRROR,
   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE]        = TCM_CLAMP,
   [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
   [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER]      = TCM_CLAMP_BORDER,
};

/* Vulkan specifies the result of shadow comparisons as:
 *     1 if   ref <op> texel,
 *     0 otherwise.
 *
 * The hardware does:
 *     0 if texel <op> ref,
 *     1 otherwise.
 *
 * So, these look a bit strange because there's both a negation
 * and swapping of the arguments involved.
 */
static const uint32_t vk_to_intel_shadow_compare_op[] = {
   [VK_COMPARE_OP_NEVER]            = PREFILTEROP_ALWAYS,
   [VK_COMPARE_OP_LESS]             = PREFILTEROP_LEQUAL,
   [VK_COMPARE_OP_EQUAL]            = PREFILTEROP_NOTEQUAL,
   [VK_COMPARE_OP_LESS_OR_EQUAL]    = PREFILTEROP_LESS,
   [VK_COMPARE_OP_GREATER]          = PREFILTEROP_GEQUAL,
   [VK_COMPARE_OP_NOT_EQUAL]        = PREFILTEROP_EQUAL,
   [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
   [VK_COMPARE_OP_ALWAYS]           = PREFILTEROP_NEVER,
};

#if GFX_VER >= 9
static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
   [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
   [VK_SAMPLER_REDUCTION_MODE_MIN]              = MINIMUM,
   [VK_SAMPLER_REDUCTION_MODE_MAX]              = MAXIMUM,
};
#endif

VkResult genX(CreateSampler)(
    VkDevice                                    _device,
    const VkSamplerCreateInfo*                  pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSampler*                                  pSampler)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_sampler *sampler;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);

   sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler),
                              VK_OBJECT_TYPE_SAMPLER);
   if (!sampler)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   sampler->n_planes = 1;

   uint32_t border_color_stride = GFX_VERx10 == 75 ? 512 : 64;
   uint32_t border_color_offset;
   ASSERTED bool has_custom_color = false;
   if (pCreateInfo->borderColor <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
      border_color_offset = device->border_colors.offset +
                            pCreateInfo->borderColor *
                            border_color_stride;
   } else {
      assert(GFX_VER >= 8);
      sampler->custom_border_color =
         anv_state_reserved_pool_alloc(&device->custom_border_colors);
      border_color_offset = sampler->custom_border_color.offset;
   }

#if GFX_VER >= 9
   unsigned sampler_reduction_mode = STD_FILTER;
   bool enable_sampler_reduction = false;
#endif

   vk_foreach_struct_const(ext, pCreateInfo->pNext) {
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO: {
         VkSamplerYcbcrConversionInfo *pSamplerConversion =
            (VkSamplerYcbcrConversionInfo *) ext;
         ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion,
                         pSamplerConversion->conversion);

         /* Ignore conversion for non-YUV formats. This fulfills a requirement
          * for clients that want to use the same code path for images with
          * external formats (VK_FORMAT_UNDEFINED) and "regular" RGBA images
          * where the format is known.
          */
         if (conversion == NULL || !conversion->format->can_ycbcr)
            break;

         sampler->n_planes = conversion->format->n_planes;
         sampler->conversion = conversion;
         break;
      }
#if GFX_VER >= 9
      case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO: {
         VkSamplerReductionModeCreateInfo *sampler_reduction =
            (VkSamplerReductionModeCreateInfo *) ext;
         sampler_reduction_mode =
            vk_to_intel_sampler_reduction_mode[sampler_reduction->reductionMode];
         enable_sampler_reduction = true;
         break;
      }
#endif
      case VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT: {
         VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color =
            (VkSamplerCustomBorderColorCreateInfoEXT *) ext;
         if (sampler->custom_border_color.map == NULL)
            break;

         union isl_color_value color = { .u32 = {
            custom_border_color->customBorderColor.uint32[0],
            custom_border_color->customBorderColor.uint32[1],
            custom_border_color->customBorderColor.uint32[2],
            custom_border_color->customBorderColor.uint32[3],
         } };

         const struct anv_format *format_desc =
            custom_border_color->format != VK_FORMAT_UNDEFINED ?
            anv_get_format(custom_border_color->format) : NULL;

         /* For formats with a swizzle, it does not carry over to the sampler
          * for border colors, so we need to do the swizzle ourselves here.
          */
         if (format_desc && format_desc->n_planes == 1 &&
             !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
            const struct anv_format_plane *fmt_plane = &format_desc->planes[0];

            assert(!isl_format_has_int_channel(fmt_plane->isl_format));
            color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
         }

         memcpy(sampler->custom_border_color.map, color.u32, sizeof(color));
         has_custom_color = true;
         break;
      }
      case VK_STRUCTURE_TYPE_SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT:
         break;
      default:
         anv_debug_ignored_stype(ext->sType);
         break;
      }
   }

   assert((sampler->custom_border_color.map == NULL) || has_custom_color);

   if (device->physical->has_bindless_samplers) {
      /* If we have bindless, allocate enough samplers. We allocate 32 bytes
       * for each sampler instead of 16 bytes because we want all bindless
       * samplers to be 32-byte aligned so we don't have to use indirect
       * sampler messages on them.
       */
      sampler->bindless_state =
         anv_state_pool_alloc(&device->dynamic_state_pool,
                              sampler->n_planes * 32, 32);
   }

   const bool seamless_cube =
      !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);

   for (unsigned p = 0; p < sampler->n_planes; p++) {
      const bool plane_has_chroma =
         sampler->conversion && sampler->conversion->format->planes[p].has_chroma;
      const VkFilter min_filter =
         plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->minFilter;
      const VkFilter mag_filter =
         plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->magFilter;
      const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST;
      const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST;
      /* From Broadwell PRM, SAMPLER_STATE:
       *   "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
       */
      const bool isl_format_is_planar_yuv = sampler->conversion &&
         isl_format_is_yuv(sampler->conversion->format->planes[0].isl_format) &&
         isl_format_is_planar(sampler->conversion->format->planes[0].isl_format);

      const uint32_t mip_filter_mode =
         isl_format_is_planar_yuv ?
         MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];

      struct GENX(SAMPLER_STATE) sampler_state = {
         .SamplerDisable = false,
         .TextureBorderColorMode = DX10OGL,

#if GFX_VER >= 11
         .CPSLODCompensationEnable = true,
#endif

#if GFX_VER >= 8
         .LODPreClampMode = CLAMP_MODE_OGL,
#else
         .LODPreClampEnable = CLAMP_ENABLE_OGL,
#endif

#if GFX_VER == 8
         .BaseMipLevel = 0.0,
#endif
         .MipModeFilter = mip_filter_mode,
         .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
         .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
         .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996),
         .AnisotropicAlgorithm =
            pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
         .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14),
         .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14),
         .ChromaKeyEnable = 0,
         .ChromaKeyIndex = 0,
         .ChromaKeyMode = 0,
         .ShadowFunction =
            vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
                                          pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
         .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,

         .BorderColorPointer = border_color_offset,

#if GFX_VER >= 8
         .LODClampMagnificationMode = MIPNONE,
#endif

         .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
         .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
         .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
         .TrilinearFilterQuality = 0,
         .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
         .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
         .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
         .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],

#if GFX_VER >= 9
         .ReductionType = sampler_reduction_mode,
         .ReductionTypeEnable = enable_sampler_reduction,
#endif
      };

      GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);

      if (sampler->bindless_state.map) {
         memcpy(sampler->bindless_state.map + p * 32,
                sampler->state[p], GENX(SAMPLER_STATE_length) * 4);
      }
   }

   *pSampler = anv_sampler_to_handle(sampler);

   return VK_SUCCESS;
}