1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30
31 #include "util/libdrm.h"
32
33 #include "intel_device_info.h"
34 #include "intel_hwconfig.h"
35 #include "intel_wa.h"
36 #include "i915/intel_device_info.h"
37 #include "xe/intel_device_info.h"
38
39 #include "common/intel_gem.h"
40 #include "util/u_debug.h"
41 #include "util/log.h"
42 #include "util/macros.h"
43
/* Lookup table pairing each short device name with one PCI device ID.
 * intel_device_name_to_pci_device_id() scans it in order.
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { .name = "lpt", .pci_id = 0x27a2 },
   { .name = "brw", .pci_id = 0x2a02 },
   { .name = "g4x", .pci_id = 0x2a42 },
   { .name = "ilk", .pci_id = 0x0042 },
   { .name = "snb", .pci_id = 0x0126 },
   { .name = "ivb", .pci_id = 0x016a },
   { .name = "hsw", .pci_id = 0x0d2e },
   { .name = "byt", .pci_id = 0x0f33 },
   { .name = "bdw", .pci_id = 0x162e },
   { .name = "chv", .pci_id = 0x22b3 },
   { .name = "skl", .pci_id = 0x1912 },
   { .name = "bxt", .pci_id = 0x5a85 },
   { .name = "kbl", .pci_id = 0x5912 },
   { .name = "aml", .pci_id = 0x591c },
   { .name = "glk", .pci_id = 0x3185 },
   { .name = "cfl", .pci_id = 0x3e9b },
   { .name = "whl", .pci_id = 0x3ea1 },
   { .name = "cml", .pci_id = 0x9b41 },
   { .name = "icl", .pci_id = 0x8a52 },
   { .name = "ehl", .pci_id = 0x4571 },
   { .name = "jsl", .pci_id = 0x4e71 },
   { .name = "tgl", .pci_id = 0x9a49 },
   { .name = "rkl", .pci_id = 0x4c8a },
   { .name = "dg1", .pci_id = 0x4905 },
   { .name = "adl", .pci_id = 0x4680 },
   { .name = "sg1", .pci_id = 0x4907 },
   { .name = "rpl", .pci_id = 0xa780 },
   { .name = "dg2", .pci_id = 0x5690 },
   { .name = "mtl", .pci_id = 0x7d60 },
   { .name = "arl", .pci_id = 0x7d67 },
   { .name = "lnl", .pci_id = 0x64a0 },
   { .name = "bmg", .pci_id = 0xe202 },
   { .name = "ptl", .pci_id = 0xb080 },
};
82
83 /**
84 * Get the PCI ID for the device name.
85 *
86 * Returns -1 if the device is not known.
87 */
88 int
intel_device_name_to_pci_device_id(const char * name)89 intel_device_name_to_pci_device_id(const char *name)
90 {
91 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
92 if (!strcmp(name_map[i].name, name))
93 return name_map[i].pci_id;
94 }
95
96 return -1;
97 }
98
99 static const struct intel_device_info intel_device_info_gfx3 = {
100 .ver = 3,
101 .platform = INTEL_PLATFORM_GFX3,
102 .simulator_id = -1,
103 .num_slices = 1,
104 .num_subslices = { 1, },
105 .max_eus_per_subslice = 8,
106 .num_thread_per_eu = 4,
107 .grf_size = 32,
108 .timestamp_frequency = 12500000,
109 };
110
111 static const struct intel_device_info intel_device_info_i965 = {
112 .ver = 4,
113 .platform = INTEL_PLATFORM_I965,
114 .has_negative_rhw_bug = true,
115 .num_slices = 1,
116 .num_subslices = { 1, },
117 .max_eus_per_subslice = 8,
118 .num_thread_per_eu = 4,
119 .grf_size = 32,
120 .max_vs_threads = 16,
121 .max_gs_threads = 2,
122 .max_wm_threads = 8 * 4,
123 .urb = {
124 .size = 256,
125 },
126 .timestamp_frequency = 12500000,
127 .simulator_id = -1,
128 };
129
130 static const struct intel_device_info intel_device_info_g4x = {
131 .ver = 4,
132 .verx10 = 45,
133 .has_pln = true,
134 .has_compr4 = true,
135 .has_surface_tile_offset = true,
136 .platform = INTEL_PLATFORM_G4X,
137 .num_slices = 1,
138 .num_subslices = { 1, },
139 .max_eus_per_subslice = 10,
140 .num_thread_per_eu = 5,
141 .grf_size = 32,
142 .max_vs_threads = 32,
143 .max_gs_threads = 2,
144 .max_wm_threads = 10 * 5,
145 .urb = {
146 .size = 384,
147 },
148 .timestamp_frequency = 12500000,
149 .simulator_id = -1,
150 };
151
152 static const struct intel_device_info intel_device_info_ilk = {
153 .ver = 5,
154 .platform = INTEL_PLATFORM_ILK,
155 .has_pln = true,
156 .has_compr4 = true,
157 .has_surface_tile_offset = true,
158 .num_slices = 1,
159 .num_subslices = { 1, },
160 .max_eus_per_subslice = 12,
161 .num_thread_per_eu = 6,
162 .grf_size = 32,
163 .max_vs_threads = 72,
164 .max_gs_threads = 32,
165 .max_wm_threads = 12 * 6,
166 .urb = {
167 .size = 1024,
168 },
169 .timestamp_frequency = 12500000,
170 .simulator_id = -1,
171 };
172
173 static const struct intel_device_info intel_device_info_snb_gt1 = {
174 .ver = 6,
175 .gt = 1,
176 .platform = INTEL_PLATFORM_SNB,
177 .has_hiz_and_separate_stencil = true,
178 .has_llc = true,
179 .has_pln = true,
180 .has_surface_tile_offset = true,
181 .needs_unlit_centroid_workaround = true,
182 .num_slices = 1,
183 .num_subslices = { 1, },
184 .max_eus_per_subslice = 6,
185 .num_thread_per_eu = 6, /* Not confirmed */
186 .grf_size = 32,
187 .max_vs_threads = 24,
188 .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
189 .max_wm_threads = 40,
190 .urb = {
191 .size = 32,
192 .min_entries = {
193 [MESA_SHADER_VERTEX] = 24,
194 },
195 .max_entries = {
196 [MESA_SHADER_VERTEX] = 256,
197 [MESA_SHADER_GEOMETRY] = 256,
198 },
199 },
200 .timestamp_frequency = 12500000,
201 .simulator_id = -1,
202 };
203
204 static const struct intel_device_info intel_device_info_snb_gt2 = {
205 .ver = 6,
206 .gt = 2,
207 .platform = INTEL_PLATFORM_SNB,
208 .has_hiz_and_separate_stencil = true,
209 .has_llc = true,
210 .has_pln = true,
211 .has_surface_tile_offset = true,
212 .needs_unlit_centroid_workaround = true,
213 .num_slices = 1,
214 .num_subslices = { 1, },
215 .max_eus_per_subslice = 12,
216 .num_thread_per_eu = 6, /* Not confirmed */
217 .grf_size = 32,
218 .max_vs_threads = 60,
219 .max_gs_threads = 60,
220 .max_wm_threads = 80,
221 .urb = {
222 .size = 64,
223 .min_entries = {
224 [MESA_SHADER_VERTEX] = 24,
225 },
226 .max_entries = {
227 [MESA_SHADER_VERTEX] = 256,
228 [MESA_SHADER_GEOMETRY] = 256,
229 },
230 },
231 .timestamp_frequency = 12500000,
232 .simulator_id = -1,
233 };
234
/* Initializers common to the .ver = 7 device descriptions. Entries placed
 * after this macro in an initializer list override the values set here.
 */
#define GFX7_FEATURES \
   .ver = 7, \
   .grf_size = 32, \
   .timestamp_frequency = 12500000, \
   .max_constant_urb_size_kb = 16, \
   .has_hiz_and_separate_stencil = true, \
   .must_use_separate_stencil = true, \
   .has_llc = true, \
   .has_pln = true, \
   .has_64bit_float = true, \
   .has_surface_tile_offset = true
246
247 static const struct intel_device_info intel_device_info_ivb_gt1 = {
248 GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
249 .num_slices = 1,
250 .num_subslices = { 1, },
251 .max_eus_per_subslice = 6,
252 .num_thread_per_eu = 6,
253 .l3_banks = 2,
254 .max_vs_threads = 36,
255 .max_tcs_threads = 36,
256 .max_tes_threads = 36,
257 .max_gs_threads = 36,
258 .max_wm_threads = 48,
259 .max_cs_threads = 36,
260 .urb = {
261 .min_entries = {
262 [MESA_SHADER_VERTEX] = 32,
263 [MESA_SHADER_TESS_EVAL] = 10,
264 },
265 .max_entries = {
266 [MESA_SHADER_VERTEX] = 512,
267 [MESA_SHADER_TESS_CTRL] = 32,
268 [MESA_SHADER_TESS_EVAL] = 288,
269 [MESA_SHADER_GEOMETRY] = 192,
270 },
271 },
272 .simulator_id = 7,
273 };
274
275 static const struct intel_device_info intel_device_info_ivb_gt2 = {
276 GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 2,
277 .num_slices = 1,
278 .num_subslices = { 1, },
279 .max_eus_per_subslice = 12,
280 .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
281 * @max_wm_threads ... */
282 .l3_banks = 4,
283 .max_vs_threads = 128,
284 .max_tcs_threads = 128,
285 .max_tes_threads = 128,
286 .max_gs_threads = 128,
287 .max_wm_threads = 172,
288 .max_cs_threads = 64,
289 .urb = {
290 .min_entries = {
291 [MESA_SHADER_VERTEX] = 32,
292 [MESA_SHADER_TESS_EVAL] = 10,
293 },
294 .max_entries = {
295 [MESA_SHADER_VERTEX] = 704,
296 [MESA_SHADER_TESS_CTRL] = 64,
297 [MESA_SHADER_TESS_EVAL] = 448,
298 [MESA_SHADER_GEOMETRY] = 320,
299 },
300 },
301 .simulator_id = 7,
302 };
303
304 static const struct intel_device_info intel_device_info_byt = {
305 GFX7_FEATURES, .platform = INTEL_PLATFORM_BYT, .gt = 1,
306 .num_slices = 1,
307 .num_subslices = { 1, },
308 .max_eus_per_subslice = 4,
309 .num_thread_per_eu = 8,
310 .l3_banks = 1,
311 .has_llc = false,
312 .max_vs_threads = 36,
313 .max_tcs_threads = 36,
314 .max_tes_threads = 36,
315 .max_gs_threads = 36,
316 .max_wm_threads = 48,
317 .max_cs_threads = 32,
318 .urb = {
319 .min_entries = {
320 [MESA_SHADER_VERTEX] = 32,
321 [MESA_SHADER_TESS_EVAL] = 10,
322 },
323 .max_entries = {
324 [MESA_SHADER_VERTEX] = 512,
325 [MESA_SHADER_TESS_CTRL] = 32,
326 [MESA_SHADER_TESS_EVAL] = 288,
327 [MESA_SHADER_GEOMETRY] = 192,
328 },
329 },
330 .simulator_id = 10,
331 };
332
/* GFX7_FEATURES plus the initializers shared by the INTEL_PLATFORM_HSW
 * descriptions (.verx10 = 75).
 */
#define HSW_FEATURES \
   GFX7_FEATURES, \
   .verx10 = 75, \
   .platform = INTEL_PLATFORM_HSW, \
   .supports_simd16_3src = true
338
339 static const struct intel_device_info intel_device_info_hsw_gt1 = {
340 HSW_FEATURES, .gt = 1,
341 .num_slices = 1,
342 .num_subslices = { 1, },
343 .max_eus_per_subslice = 10,
344 .num_thread_per_eu = 7,
345 .l3_banks = 2,
346 .max_vs_threads = 70,
347 .max_tcs_threads = 70,
348 .max_tes_threads = 70,
349 .max_gs_threads = 70,
350 .max_wm_threads = 102,
351 .max_cs_threads = 70,
352 .urb = {
353 .min_entries = {
354 [MESA_SHADER_VERTEX] = 32,
355 [MESA_SHADER_TESS_EVAL] = 10,
356 },
357 .max_entries = {
358 [MESA_SHADER_VERTEX] = 640,
359 [MESA_SHADER_TESS_CTRL] = 64,
360 [MESA_SHADER_TESS_EVAL] = 384,
361 [MESA_SHADER_GEOMETRY] = 256,
362 },
363 },
364 .simulator_id = 9,
365 };
366
367 static const struct intel_device_info intel_device_info_hsw_gt2 = {
368 HSW_FEATURES, .gt = 2,
369 .num_slices = 1,
370 .num_subslices = { 2, },
371 .max_eus_per_subslice = 10,
372 .num_thread_per_eu = 7,
373 .l3_banks = 4,
374 .max_vs_threads = 280,
375 .max_tcs_threads = 256,
376 .max_tes_threads = 280,
377 .max_gs_threads = 256,
378 .max_wm_threads = 204,
379 .max_cs_threads = 70,
380 .urb = {
381 .min_entries = {
382 [MESA_SHADER_VERTEX] = 64,
383 [MESA_SHADER_TESS_EVAL] = 10,
384 },
385 .max_entries = {
386 [MESA_SHADER_VERTEX] = 1664,
387 [MESA_SHADER_TESS_CTRL] = 128,
388 [MESA_SHADER_TESS_EVAL] = 960,
389 [MESA_SHADER_GEOMETRY] = 640,
390 },
391 },
392 .simulator_id = 9,
393 };
394
395 static const struct intel_device_info intel_device_info_hsw_gt3 = {
396 HSW_FEATURES, .gt = 3,
397 .num_slices = 2,
398 .num_subslices = { 2, 2, },
399 .max_eus_per_subslice = 10,
400 .num_thread_per_eu = 7,
401 .l3_banks = 8,
402 .max_vs_threads = 280,
403 .max_tcs_threads = 256,
404 .max_tes_threads = 280,
405 .max_gs_threads = 256,
406 .max_wm_threads = 408,
407 .max_cs_threads = 70,
408 .urb = {
409 .min_entries = {
410 [MESA_SHADER_VERTEX] = 64,
411 [MESA_SHADER_TESS_EVAL] = 10,
412 },
413 .max_entries = {
414 [MESA_SHADER_VERTEX] = 1664,
415 [MESA_SHADER_TESS_CTRL] = 128,
416 [MESA_SHADER_TESS_EVAL] = 960,
417 [MESA_SHADER_GEOMETRY] = 640,
418 },
419 },
420 .max_constant_urb_size_kb = 32,
421 .simulator_id = 9,
422 };
423
/* Base initializers for .ver = 8; later macros in this file also start from
 * this list and override individual fields.
 *
 * How well sampling from the hiz buffer is supported on GFX8 is unclear, so
 * stay conservative for now and leave has_sample_with_hiz = false.
 */
#define GFX8_FEATURES \
   .ver = 8, \
   .grf_size = 32, \
   .num_thread_per_eu = 7, \
   .timestamp_frequency = 12500000, \
   .max_constant_urb_size_kb = 32, \
   .has_hiz_and_separate_stencil = true, \
   .must_use_separate_stencil = true, \
   .has_llc = true, \
   .has_sample_with_hiz = false, \
   .has_pln = true, \
   .has_integer_dword_mul = true, \
   .has_64bit_float = true, \
   .has_64bit_int = true, \
   .supports_simd16_3src = true, \
   .has_surface_tile_offset = true, \
   .max_vs_threads = 504, \
   .max_tcs_threads = 504, \
   .max_tes_threads = 504, \
   .max_gs_threads = 504, \
   .max_wm_threads = 384, \
   .max_threads_per_psd = 64
449
450 static const struct intel_device_info intel_device_info_bdw_gt1 = {
451 GFX8_FEATURES, .gt = 1,
452 .platform = INTEL_PLATFORM_BDW,
453 .num_slices = 1,
454 .num_subslices = { 2, },
455 .max_eus_per_subslice = 6,
456 .l3_banks = 2,
457 .max_cs_threads = 42,
458 .urb = {
459 .min_entries = {
460 [MESA_SHADER_VERTEX] = 64,
461 [MESA_SHADER_TESS_EVAL] = 34,
462 },
463 .max_entries = {
464 [MESA_SHADER_VERTEX] = 2560,
465 [MESA_SHADER_TESS_CTRL] = 504,
466 [MESA_SHADER_TESS_EVAL] = 1536,
467 /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
468 [MESA_SHADER_GEOMETRY] = 690,
469 },
470 },
471 .simulator_id = 11,
472 };
473
474 static const struct intel_device_info intel_device_info_bdw_gt2 = {
475 GFX8_FEATURES, .gt = 2,
476 .platform = INTEL_PLATFORM_BDW,
477 .num_slices = 1,
478 .num_subslices = { 3, },
479 .max_eus_per_subslice = 8,
480 .l3_banks = 4,
481 .max_cs_threads = 56,
482 .urb = {
483 .min_entries = {
484 [MESA_SHADER_VERTEX] = 64,
485 [MESA_SHADER_TESS_EVAL] = 34,
486 },
487 .max_entries = {
488 [MESA_SHADER_VERTEX] = 2560,
489 [MESA_SHADER_TESS_CTRL] = 504,
490 [MESA_SHADER_TESS_EVAL] = 1536,
491 [MESA_SHADER_GEOMETRY] = 960,
492 },
493 },
494 .simulator_id = 11,
495 };
496
497 static const struct intel_device_info intel_device_info_bdw_gt3 = {
498 GFX8_FEATURES, .gt = 3,
499 .platform = INTEL_PLATFORM_BDW,
500 .num_slices = 2,
501 .num_subslices = { 3, 3, },
502 .max_eus_per_subslice = 8,
503 .l3_banks = 8,
504 .max_cs_threads = 56,
505 .urb = {
506 .min_entries = {
507 [MESA_SHADER_VERTEX] = 64,
508 [MESA_SHADER_TESS_EVAL] = 34,
509 },
510 .max_entries = {
511 [MESA_SHADER_VERTEX] = 2560,
512 [MESA_SHADER_TESS_CTRL] = 504,
513 [MESA_SHADER_TESS_EVAL] = 1536,
514 [MESA_SHADER_GEOMETRY] = 960,
515 },
516 },
517 .simulator_id = 11,
518 };
519
520 static const struct intel_device_info intel_device_info_chv = {
521 GFX8_FEATURES, .platform = INTEL_PLATFORM_CHV, .gt = 1,
522 .has_llc = false,
523 .has_integer_dword_mul = false,
524 .num_slices = 1,
525 .num_subslices = { 2, },
526 .max_eus_per_subslice = 8,
527 .l3_banks = 2,
528 .max_vs_threads = 80,
529 .max_tcs_threads = 80,
530 .max_tes_threads = 80,
531 .max_gs_threads = 80,
532 .max_wm_threads = 128,
533 .max_cs_threads = 6 * 7,
534 .urb = {
535 .min_entries = {
536 [MESA_SHADER_VERTEX] = 34,
537 [MESA_SHADER_TESS_EVAL] = 34,
538 },
539 .max_entries = {
540 [MESA_SHADER_VERTEX] = 640,
541 [MESA_SHADER_TESS_CTRL] = 80,
542 [MESA_SHADER_TESS_EVAL] = 384,
543 [MESA_SHADER_GEOMETRY] = 256,
544 },
545 },
546 .simulator_id = 13,
547 };
548
/* .ver = 9 thread limits, timestamp frequency and URB entry bounds. Meant to
 * follow GFX8_FEATURES in an initializer list so these values take effect.
 */
#define GFX9_HW_INFO \
   .ver = 9, \
   .timestamp_frequency = 12000000, \
   .max_vs_threads = 336, \
   .max_tcs_threads = 336, \
   .max_tes_threads = 336, \
   .max_gs_threads = 336, \
   .max_cs_threads = 56, \
   .max_threads_per_psd = 64, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 64, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 1856, \
         [MESA_SHADER_TESS_CTRL] = 672, \
         [MESA_SHADER_TESS_EVAL] = 1120, \
         [MESA_SHADER_GEOMETRY] = 640, \
      }, \
   }
570
/* GFX8_FEATURES + GFX9_HW_INFO, followed by the overrides shared by the
 * descriptions built from GFX9_LP_FEATURES_3X6 / GFX9_LP_FEATURES_2X6. The
 * .urb initializer here replaces the one from GFX9_HW_INFO.
 */
#define GFX9_LP_FEATURES \
   GFX8_FEATURES, \
   GFX9_HW_INFO, \
   .gt = 1, \
   .num_slices = 1, \
   .num_thread_per_eu = 6, \
   .timestamp_frequency = 19200000, \
   .has_integer_dword_mul = false, \
   .has_llc = false, \
   .has_sample_with_hiz = true, \
   .has_illegal_ccs_values = true, \
   .max_vs_threads = 112, \
   .max_tcs_threads = 112, \
   .max_tes_threads = 112, \
   .max_gs_threads = 112, \
   .max_cs_threads = 6 * 6, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 34, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 704, \
         [MESA_SHADER_TESS_CTRL] = 256, \
         [MESA_SHADER_TESS_EVAL] = 416, \
         [MESA_SHADER_GEOMETRY] = 256, \
      }, \
   }
599
/* GFX9_LP_FEATURES with one slice of 3 subslices, 6 EUs per subslice. */
#define GFX9_LP_FEATURES_3X6 \
   GFX9_LP_FEATURES, \
   .max_eus_per_subslice = 6, \
   .num_subslices = { 3 }

/* GFX9_LP_FEATURES with one slice of 2 subslices, 6 EUs per subslice. The
 * thread limits and URB entry bounds set by GFX9_LP_FEATURES are replaced.
 */
#define GFX9_LP_FEATURES_2X6 \
   GFX9_LP_FEATURES, \
   .max_eus_per_subslice = 6, \
   .num_subslices = { 2 }, \
   .max_vs_threads = 56, \
   .max_tcs_threads = 56, \
   .max_tes_threads = 56, \
   .max_gs_threads = 56, \
   .max_cs_threads = 6 * 6, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 34, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 352, \
         [MESA_SHADER_TESS_CTRL] = 128, \
         [MESA_SHADER_TESS_EVAL] = 208, \
         [MESA_SHADER_GEOMETRY] = 128, \
      }, \
   }
626
/* GFX8_FEATURES + GFX9_HW_INFO plus the flags and cooperative-matrix
 * configurations shared by the SKL/KBL/CFL descriptions below.
 */
#define GFX9_FEATURES \
   GFX8_FEATURES, \
   GFX9_HW_INFO, \
   .has_illegal_ccs_values = true, \
   .has_sample_with_hiz = true, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
637
638 static const struct intel_device_info intel_device_info_skl_gt1 = {
639 GFX9_FEATURES, .gt = 1,
640 .platform = INTEL_PLATFORM_SKL,
641 .num_slices = 1,
642 .num_subslices = { 2, },
643 .max_eus_per_subslice = 6,
644 .l3_banks = 2,
645 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
646 * leading to some vertices to go missing if we use too much URB.
647 */
648 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
649 .simulator_id = 12,
650 };
651
652 static const struct intel_device_info intel_device_info_skl_gt2 = {
653 GFX9_FEATURES, .gt = 2,
654 .platform = INTEL_PLATFORM_SKL,
655 .num_slices = 1,
656 .num_subslices = { 3, },
657 .max_eus_per_subslice = 8,
658 .l3_banks = 4,
659 .simulator_id = 12,
660 };
661
662 static const struct intel_device_info intel_device_info_skl_gt3 = {
663 GFX9_FEATURES, .gt = 3,
664 .platform = INTEL_PLATFORM_SKL,
665 .num_slices = 2,
666 .num_subslices = { 3, 3, },
667 .max_eus_per_subslice = 8,
668 .l3_banks = 8,
669 .simulator_id = 12,
670 };
671
672 static const struct intel_device_info intel_device_info_skl_gt4 = {
673 GFX9_FEATURES, .gt = 4,
674 .platform = INTEL_PLATFORM_SKL,
675 .num_slices = 3,
676 .num_subslices = { 3, 3, 3, },
677 .max_eus_per_subslice = 8,
678 .l3_banks = 12,
679 /* From the "L3 Allocation and Programming" documentation:
680 *
681 * "URB is limited to 1008KB due to programming restrictions. This is not a
682 * restriction of the L3 implementation, but of the FF and other clients.
683 * Therefore, in a GT4 implementation it is possible for the programmed
684 * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
685 * only 1008KB of this will be used."
686 */
687 .simulator_id = 12,
688 };
689
690 static const struct intel_device_info intel_device_info_bxt = {
691 GFX9_LP_FEATURES_3X6,
692 .platform = INTEL_PLATFORM_BXT,
693 .l3_banks = 2,
694 .simulator_id = 14,
695 };
696
697 static const struct intel_device_info intel_device_info_bxt_2x6 = {
698 GFX9_LP_FEATURES_2X6,
699 .platform = INTEL_PLATFORM_BXT,
700 .l3_banks = 1,
701 .simulator_id = 14,
702 };
703 /*
704 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
705 * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
706 */
707
708 static const struct intel_device_info intel_device_info_kbl_gt1 = {
709 GFX9_FEATURES,
710 .platform = INTEL_PLATFORM_KBL,
711 .gt = 1,
712
713 .max_cs_threads = 7 * 6,
714 .num_slices = 1,
715 .num_subslices = { 2, },
716 .max_eus_per_subslice = 6,
717 .l3_banks = 2,
718 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
719 * leading to some vertices to go missing if we use too much URB.
720 */
721 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
722 .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
723 .simulator_id = 16,
724 };
725
726 static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
727 GFX9_FEATURES,
728 .platform = INTEL_PLATFORM_KBL,
729 .gt = 1,
730
731 .max_cs_threads = 7 * 6,
732 .num_slices = 1,
733 .num_subslices = { 3, },
734 .max_eus_per_subslice = 6,
735 .l3_banks = 4,
736 .simulator_id = 16,
737 };
738
739 static const struct intel_device_info intel_device_info_kbl_gt2 = {
740 GFX9_FEATURES,
741 .platform = INTEL_PLATFORM_KBL,
742 .gt = 2,
743
744 .num_slices = 1,
745 .num_subslices = { 3, },
746 .max_eus_per_subslice = 8,
747 .l3_banks = 4,
748 .simulator_id = 16,
749 };
750
751 static const struct intel_device_info intel_device_info_kbl_gt3 = {
752 GFX9_FEATURES,
753 .platform = INTEL_PLATFORM_KBL,
754 .gt = 3,
755
756 .num_slices = 2,
757 .num_subslices = { 3, 3, },
758 .max_eus_per_subslice = 8,
759 .l3_banks = 8,
760 .simulator_id = 16,
761 };
762
763 static const struct intel_device_info intel_device_info_kbl_gt4 = {
764 GFX9_FEATURES,
765 .platform = INTEL_PLATFORM_KBL,
766 .gt = 4,
767
768 /*
769 * From the "L3 Allocation and Programming" documentation:
770 *
771 * "URB is limited to 1008KB due to programming restrictions. This
772 * is not a restriction of the L3 implementation, but of the FF and
773 * other clients. Therefore, in a GT4 implementation it is
774 * possible for the programmed allocation of the L3 data array to
775 * provide 3*384KB=1152KB for URB, but only 1008KB of this
776 * will be used."
777 */
778 .num_slices = 3,
779 .num_subslices = { 3, 3, 3, },
780 .max_eus_per_subslice = 8,
781 .l3_banks = 12,
782 .simulator_id = 16,
783 };
784
785 static const struct intel_device_info intel_device_info_glk = {
786 GFX9_LP_FEATURES_3X6,
787 .platform = INTEL_PLATFORM_GLK,
788 .l3_banks = 2,
789 .simulator_id = 17,
790 };
791
792 static const struct intel_device_info intel_device_info_glk_2x6 = {
793 GFX9_LP_FEATURES_2X6,
794 .platform = INTEL_PLATFORM_GLK,
795 .l3_banks = 2,
796 .simulator_id = 17,
797 };
798
799 static const struct intel_device_info intel_device_info_cfl_gt1 = {
800 GFX9_FEATURES,
801 .platform = INTEL_PLATFORM_CFL,
802 .gt = 1,
803
804 .num_slices = 1,
805 .num_subslices = { 2, },
806 .max_eus_per_subslice = 6,
807 .l3_banks = 2,
808 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
809 * leading to some vertices to go missing if we use too much URB.
810 */
811 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
812 .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
813 .simulator_id = 24,
814 };
815 static const struct intel_device_info intel_device_info_cfl_gt2 = {
816 GFX9_FEATURES,
817 .platform = INTEL_PLATFORM_CFL,
818 .gt = 2,
819
820 .num_slices = 1,
821 .num_subslices = { 3, },
822 .max_eus_per_subslice = 8,
823 .l3_banks = 4,
824 .simulator_id = 24,
825 };
826
827 static const struct intel_device_info intel_device_info_cfl_gt3 = {
828 GFX9_FEATURES,
829 .platform = INTEL_PLATFORM_CFL,
830 .gt = 3,
831
832 .num_slices = 2,
833 .num_subslices = { 3, 3, },
834 .max_eus_per_subslice = 8,
835 .l3_banks = 8,
836 .simulator_id = 24,
837 };
838
/* Wraps its arguments in a brace-enclosed initializer list so a subslice
 * list can be passed as a single macro argument.
 */
#define subslices(...) { __VA_ARGS__, }
840
/* .ver = 11 thread limits; also clears has_pln. Meant to follow
 * GFX8_FEATURES in an initializer list.
 */
#define GFX11_HW_INFO \
   .ver = 11, \
   .has_pln = false, \
   .max_vs_threads = 364, \
   .max_tcs_threads = 224, \
   .max_tes_threads = 364, \
   .max_gs_threads = 224, \
   .max_cs_threads = 56, \
   .max_threads_per_psd = 64
850
/* Full initializer list for a .ver = 11 description.
 *
 * _gt, _slices, _l3 and _platform fill .gt, .num_slices, .l3_banks and
 * .platform; _subslices is a brace-enclosed list (see subslices()) assigned
 * to .num_subslices.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform) \
   GFX8_FEATURES, \
   GFX11_HW_INFO, \
   .platform = _platform, \
   .gt = _gt, \
   .num_slices = _slices, \
   .num_subslices = _subslices, \
   .l3_banks = _l3, \
   .max_eus_per_subslice = 8, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .has_sample_with_hiz = false, \
   .has_illegal_ccs_values = true, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
868
/* URB entry bounds for the .ver = 11 descriptions; expands to the
 * .min_entries / .max_entries members of a .urb initializer.
 */
#define GFX11_URB_MIN_MAX_ENTRIES \
   .min_entries = { \
      [MESA_SHADER_VERTEX] = 64, \
      [MESA_SHADER_TESS_EVAL] = 34, \
   }, \
   .max_entries = { \
      [MESA_SHADER_VERTEX] = 2384, \
      [MESA_SHADER_TESS_CTRL] = 1032, \
      [MESA_SHADER_TESS_EVAL] = 2384, \
      [MESA_SHADER_GEOMETRY] = 1032, \
   }
880
881 static const struct intel_device_info intel_device_info_icl_gt2 = {
882 GFX11_FEATURES(2, 1, subslices(8), 8, INTEL_PLATFORM_ICL),
883 .urb = {
884 GFX11_URB_MIN_MAX_ENTRIES,
885 },
886 .simulator_id = 19,
887 };
888
889 static const struct intel_device_info intel_device_info_icl_gt1_5 = {
890 GFX11_FEATURES(1, 1, subslices(6), 6, INTEL_PLATFORM_ICL),
891 .urb = {
892 GFX11_URB_MIN_MAX_ENTRIES,
893 },
894 .simulator_id = 19,
895 };
896
897 static const struct intel_device_info intel_device_info_icl_gt1 = {
898 GFX11_FEATURES(1, 1, subslices(4), 6, INTEL_PLATFORM_ICL),
899 .urb = {
900 GFX11_URB_MIN_MAX_ENTRIES,
901 },
902 .simulator_id = 19,
903 };
904
905 static const struct intel_device_info intel_device_info_icl_gt0_5 = {
906 GFX11_FEATURES(1, 1, subslices(1), 6, INTEL_PLATFORM_ICL),
907 .urb = {
908 GFX11_URB_MIN_MAX_ENTRIES,
909 },
910 .simulator_id = 19,
911 };
912
/* Extra initializers appended after GFX11_FEATURES() by the
 * INTEL_PLATFORM_EHL descriptions.
 */
#define GFX11_LP_FEATURES \
   .simulator_id = 28, \
   .disable_ccs_repack = true, \
   .has_illegal_ccs_values = true, \
   .urb = { \
      GFX11_URB_MIN_MAX_ENTRIES, \
   }
920
921 static const struct intel_device_info intel_device_info_ehl_4x8 = {
922 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
923 GFX11_LP_FEATURES,
924 };
925
926 static const struct intel_device_info intel_device_info_ehl_4x6 = {
927 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
928 GFX11_LP_FEATURES,
929 .max_eus_per_subslice = 6,
930 };
931
932 static const struct intel_device_info intel_device_info_ehl_4x5 = {
933 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
934 GFX11_LP_FEATURES,
935 .max_eus_per_subslice = 5,
936 };
937
938 static const struct intel_device_info intel_device_info_ehl_4x4 = {
939 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
940 GFX11_LP_FEATURES,
941 .max_eus_per_subslice = 4,
942 };
943
944 static const struct intel_device_info intel_device_info_ehl_2x8 = {
945 GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
946 GFX11_LP_FEATURES,
947 };
948
949 static const struct intel_device_info intel_device_info_ehl_2x4 = {
950 GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
951 GFX11_LP_FEATURES,
952 .max_eus_per_subslice = 4,
953 };
954
/* .ver = 12 flags, thread limits and URB configuration. Meant to follow
 * GFX8_FEATURES in an initializer list.
 */
#define GFX12_HW_INFO \
   .ver = 12, \
   .has_aux_map = true, \
   .has_pln = false, \
   .has_sample_with_hiz = false, \
   .max_vs_threads = 546, \
   .max_tcs_threads = 336, \
   .max_tes_threads = 546, \
   .max_gs_threads = 336, \
   .max_cs_threads = 112, /* threads per DSS */ \
   .max_threads_per_psd = 64, \
   .urb = { \
      .size = 512, /* For intel_stub_gpu */ \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 64, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 3576, \
         [MESA_SHADER_TESS_CTRL] = 1548, \
         [MESA_SHADER_TESS_EVAL] = 3576, \
         [MESA_SHADER_GEOMETRY] = 1548, \
      }, \
   }
979
/* Full initializer list for a .ver = 12 description. _gt, _slices and _l3
 * fill .gt, .num_slices and .l3_banks; .num_subslices and .platform are left
 * to the user of the macro.
 */
#define GFX12_FEATURES(_gt, _slices, _l3) \
   GFX8_FEATURES, \
   GFX12_HW_INFO, \
   .gt = _gt, \
   .num_slices = _slices, \
   .l3_banks = _l3, \
   .max_eus_per_subslice = 16, \
   .simulator_id = 22, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   /* BSpec 45101 (r51017) */ \
   .pat = { \
      /* CPU: WB, GPU: PAT 0 => WB, 2WAY */ \
      .cached_coherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .scanout = PAT_ENTRY(1, WC), \
      /* CPU: WB, GPU: PAT 0 => WB, 2WAY */ \
      .writeback_incoherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .writecombining = PAT_ENTRY(1, WC), \
   }, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
1005
/* Wraps its arguments in a brace-enclosed initializer list for
 * .num_subslices.
 */
#define dual_subslices(...) { __VA_ARGS__, }
1007
/* GFX12_FEATURES for .gt = 1 with 4 L3 banks and a single dual-subslice. */
#define GFX12_GT05_FEATURES \
   GFX12_FEATURES(1, 1, 4), \
   .num_subslices = dual_subslices(1)

/* GFX12_FEATURES for a single-slice part selected by _gt:
 *   _gt == 1 -> 4 L3 banks, 2 dual-subslices
 *   otherwise -> 8 L3 banks, 6 dual-subslices
 *
 * _gt is parenthesized wherever it appears inside an expression so that a
 * non-trivial argument cannot change how the comparison is parsed.
 */
#define GFX12_GT_FEATURES(_gt) \
   GFX12_FEATURES(_gt, 1, ((_gt) == 1 ? 4 : 8)), \
   .num_subslices = dual_subslices((_gt) == 1 ? 2 : 6)
1015
1016 static const struct intel_device_info intel_device_info_tgl_gt1 = {
1017 GFX12_GT_FEATURES(1),
1018 .platform = INTEL_PLATFORM_TGL,
1019 };
1020
1021 static const struct intel_device_info intel_device_info_tgl_gt2 = {
1022 GFX12_GT_FEATURES(2),
1023 .platform = INTEL_PLATFORM_TGL,
1024 };
1025
1026 static const struct intel_device_info intel_device_info_rkl_gt05 = {
1027 GFX12_GT05_FEATURES,
1028 .platform = INTEL_PLATFORM_RKL,
1029 };
1030
1031 static const struct intel_device_info intel_device_info_rkl_gt1 = {
1032 GFX12_GT_FEATURES(1),
1033 .platform = INTEL_PLATFORM_RKL,
1034 };
1035
1036 static const struct intel_device_info intel_device_info_adl_gt05 = {
1037 GFX12_GT05_FEATURES,
1038 .platform = INTEL_PLATFORM_ADL,
1039 };
1040
1041 static const struct intel_device_info intel_device_info_adl_gt1 = {
1042 GFX12_GT_FEATURES(1),
1043 .platform = INTEL_PLATFORM_ADL,
1044 };
1045
1046 static const struct intel_device_info intel_device_info_adl_n = {
1047 GFX12_GT_FEATURES(1),
1048 .platform = INTEL_PLATFORM_ADL,
1049 .is_adl_n = true,
1050 };
1051
1052 static const struct intel_device_info intel_device_info_adl_gt2 = {
1053 GFX12_GT_FEATURES(2),
1054 .platform = INTEL_PLATFORM_ADL,
1055 };
1056
1057 static const struct intel_device_info intel_device_info_rpl = {
1058 GFX12_FEATURES(1, 1, 4),
1059 .num_subslices = dual_subslices(2),
1060 .platform = INTEL_PLATFORM_RPL,
1061 };
1062
1063 static const struct intel_device_info intel_device_info_rpl_p = {
1064 GFX12_GT_FEATURES(2),
1065 .platform = INTEL_PLATFORM_RPL,
1066 };
1067
1068 #define GFX12_DG1_SG1_FEATURES \
1069 GFX12_GT_FEATURES(2), \
1070 .platform = INTEL_PLATFORM_DG1, \
1071 .has_llc = false, \
1072 .has_local_mem = true, \
1073 .urb.size = 768, \
1074 .simulator_id = 30
1075
1076 static const struct intel_device_info intel_device_info_dg1 = {
1077 GFX12_DG1_SG1_FEATURES,
1078 };
1079
1080 static const struct intel_device_info intel_device_info_sg1 = {
1081 GFX12_DG1_SG1_FEATURES,
1082 };
1083
/* URB entry bounds used by XEHP_FEATURES; expands to the .min_entries /
 * .max_entries members of a .urb initializer.
 */
#define XEHP_URB_MIN_MAX_ENTRIES \
   .min_entries = { \
      [MESA_SHADER_VERTEX] = 64, \
      [MESA_SHADER_TESS_EVAL] = 34, \
   }, \
   .max_entries = { \
      [MESA_SHADER_VERTEX] = 3832, /* BSpec 47138 */ \
      [MESA_SHADER_TESS_CTRL] = 1548, /* BSpec 47137 */ \
      [MESA_SHADER_TESS_EVAL] = 3576, /* BSpec 47135 */ \
      [MESA_SHADER_GEOMETRY] = 1548, /* BSpec 47136 */ \
   }
1095
/* Full initializer list for a .verx10 = 125 description. Starts from
 * GFX8_FEATURES and overrides it; _gt, _slices and _l3 fill .gt,
 * .num_slices and .l3_banks.
 */
#define XEHP_FEATURES(_gt, _slices, _l3) \
   GFX8_FEATURES, \
   .ver = 12, \
   .verx10 = 125, \
   .simulator_id = 29, \
   .gt = _gt, \
   .num_slices = _slices, \
   .l3_banks = _l3, \
   .num_subslices = dual_subslices(1), /* updated by topology */ \
   .max_eus_per_subslice = 16, \
   .num_thread_per_eu = 8 /* BSpec 44472 */, \
   .needs_null_push_constant_tbimr_workaround = true, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .has_pln = false, \
   .has_sample_with_hiz = false, \
   .has_llc = false, \
   .has_lsc = true, \
   .has_local_mem = true, \
   .has_aux_map = false, \
   .max_vs_threads = 546, /* BSpec 46312 */ \
   .max_gs_threads = 336, /* BSpec 46299 */ \
   .max_tcs_threads = 336, /* BSpec 46300 */ \
   .max_tes_threads = 546, /* BSpec 46298 */ \
   .max_threads_per_psd = 64, \
   .max_cs_threads = 112, /* threads per DSS */ \
   .urb = { \
      .size = 768, /* For intel_stub_gpu */ \
      XEHP_URB_MIN_MAX_ENTRIES, \
   }, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
1130
/* Initializer shared by all DG2 and ATS-M device descriptions: the
 * XEHP_FEATURES(0, 1, 0) base plus a default revision, the coarse-pixel,
 * mesh-shading, ray-tracing and flat-CCS feature flags, and the PAT entries
 * used for each caching mode.
 */
#define DG2_FEATURES \
   /* (Sub)slice info comes from the kernel topology info */ \
   XEHP_FEATURES(0, 1, 0), \
   .revision = 4, /* For offline compiler */ \
   .has_coarse_pixel_primitive_and_cb = true, \
   .has_mesh_shading = true, \
   .has_ray_tracing = true, \
   .has_flat_ccs = true, \
   /* There is no PAT table for DG2, using TGL ones */ \
   /* BSpec 45101 (r51017) */ \
   .pat = { \
      /* CPU: WB, GPU: PAT 0 => WB, 2WAY */ \
      .cached_coherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .scanout = PAT_ENTRY(1, WC), \
      /* CPU: WB, GPU: PAT 0 => WB, 2WAY */ \
      .writeback_incoherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .writecombining = PAT_ENTRY(1, WC), \
   }
1151
/* Device description template for DG2-G10: DG2_FEATURES plus the platform enum. */
static const struct intel_device_info intel_device_info_dg2_g10 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G10,
};
1156
/* Device description template for DG2-G11: DG2_FEATURES plus the platform enum. */
static const struct intel_device_info intel_device_info_dg2_g11 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G11,
};
1161
/* Device description template for DG2-G12: DG2_FEATURES plus the platform enum. */
static const struct intel_device_info intel_device_info_dg2_g12 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G12,
};
1166
/* Device description template for ATS-M G10; shares DG2_FEATURES with the
 * DG2 templates and differs only in the platform enum.
 */
static const struct intel_device_info intel_device_info_atsm_g10 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_ATSM_G10,
};
1171
/* Device description template for ATS-M G11; shares DG2_FEATURES with the
 * DG2 templates and differs only in the platform enum.
 */
static const struct intel_device_info intel_device_info_atsm_g11 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_ATSM_G11,
};
1176
/* Initializer shared by the MTL and ARL device descriptions.
 *
 * Starts from XEHP_FEATURES(0, 1, 0) and overrides the fields that differ:
 * no local memory, aux-map enabled, 64-bit float support (flagged as going
 * through the math pipe), plus the coarse-pixel, mesh-shading and ray-tracing
 * feature flags and the platform's PAT entries. The explicit
 * .has_integer_dword_mul = false repeats the value already set by
 * XEHP_FEATURES.
 */
#define MTL_FEATURES \
   /* (Sub)slice info comes from the kernel topology info */ \
   XEHP_FEATURES(0, 1, 0), \
   .has_local_mem = false, \
   .has_aux_map = true, \
   .has_64bit_float = true, \
   .has_64bit_float_via_math_pipe = true, \
   .has_integer_dword_mul = false, \
   .has_coarse_pixel_primitive_and_cb = true, \
   .has_mesh_shading = true, \
   .has_ray_tracing = true, \
   /* BSpec 45101 (r51017) */ \
   .pat = { \
      /* CPU: WB, GPU: PAT 3 => WB, 1WAY */ \
      .cached_coherent = PAT_ENTRY(3, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .scanout = PAT_ENTRY(1, WC), \
      /* CPU: WB, GPU: PAT 0 => WB, 0WAY */ \
      .writeback_incoherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .writecombining = PAT_ENTRY(1, WC), \
   }
1199
/* Device description template for MTL-U: MTL_FEATURES plus the platform enum. */
static const struct intel_device_info intel_device_info_mtl_u = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_MTL_U,
};
1204
/* Device description template for MTL-H: MTL_FEATURES plus the platform enum. */
static const struct intel_device_info intel_device_info_mtl_h = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_MTL_H,
};
1209
/* Device description template for ARL-U; reuses MTL_FEATURES and differs from
 * the MTL templates only in the platform enum.
 */
static const struct intel_device_info intel_device_info_arl_u = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_ARL_U,
};
1214
/* Device description template for ARL-H; reuses MTL_FEATURES and differs from
 * the MTL templates only in the platform enum.
 */
static const struct intel_device_info intel_device_info_arl_h = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_ARL_H,
};
1219
/* Base initializer for the ver 20 (verx10 200) device descriptions.
 *
 * Starts from XEHP_FEATURES(0, 1, 0) and overrides the version numbers, GRF
 * size, 64-bit type support, feature flags, PAT entries and cooperative
 * matrix configurations. has_local_mem is inherited as true from
 * XEHP_FEATURES, so each platform using this macro sets it explicitly.
 *
 * NOTE(review): this PAT table has no .writeback_incoherent entry but adds a
 * .compressed one - confirm against the BSpec page cited below that the
 * omission is intentional.
 */
#define XE2_FEATURES \
   /* (Sub)slice info comes from the kernel topology info */ \
   XEHP_FEATURES(0, 1, 0), \
   .ver = 20, \
   .verx10 = 200, \
   .num_subslices = dual_subslices(1), \
   .grf_size = 64, \
   .needs_null_push_constant_tbimr_workaround = false, \
   .has_64bit_float = true, \
   .has_64bit_int = true, \
   .has_integer_dword_mul = false, \
   .has_coarse_pixel_primitive_and_cb = true, \
   .has_mesh_shading = true, \
   .has_ray_tracing = true, \
   .has_indirect_unroll = true, \
   /* BSpec 71582 (r59285) */ \
   .pat = { \
      /* CPU: WB, GPU: PAT 1 => WB, 1WAY */ \
      .cached_coherent = PAT_ENTRY(1, WB), \
      /* CPU: WC, GPU: PAT 6 => XD */ \
      .scanout = PAT_ENTRY(6, WC), \
      /* CPU: WC, GPU: PAT 0 => WB */ \
      .writecombining = PAT_ENTRY(0, WC), \
      /* CPU: WC, GPU: PAT 11 => XD, compressed */ \
      .compressed = PAT_ENTRY(11, WC) \
   }, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 16, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 16, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 16, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }, \
   .has_flat_ccs = true
1252
/* Device description template for BMG: XE2_FEATURES with local memory enabled. */
static const struct intel_device_info intel_device_info_bmg = {
   XE2_FEATURES,
   .platform = INTEL_PLATFORM_BMG,
   .has_local_mem = true,
};
1258
/* Device description template for LNL: XE2_FEATURES with local memory
 * disabled.
 */
static const struct intel_device_info intel_device_info_lnl = {
   XE2_FEATURES,
   .platform = INTEL_PLATFORM_LNL,
   .has_local_mem = false,
};
1264
/* Base initializer for the ver 30 (verx10 300) device descriptions: identical
 * to XE2_FEATURES except for the two version fields.
 */
#define XE3_FEATURES \
   XE2_FEATURES, \
   .ver = 30, \
   .verx10 = 300
1269
/* Device description template for PTL: XE3_FEATURES with local memory
 * disabled.
 */
static const struct intel_device_info intel_device_info_ptl = {
   XE3_FEATURES,
   .platform = INTEL_PLATFORM_PTL,
   .has_local_mem = false,
};
1275
1276 void
intel_device_info_topology_reset_masks(struct intel_device_info * devinfo)1277 intel_device_info_topology_reset_masks(struct intel_device_info *devinfo)
1278 {
1279 devinfo->subslice_slice_stride = 0;
1280 devinfo->eu_subslice_stride = 0;
1281 devinfo->eu_slice_stride = 0;
1282
1283 devinfo->num_slices = 0;
1284 memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1285
1286 memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1287 memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1288 memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1289 memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1290 }
1291
1292 void
intel_device_info_topology_update_counts(struct intel_device_info * devinfo)1293 intel_device_info_topology_update_counts(struct intel_device_info *devinfo)
1294 {
1295 devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1296 devinfo->subslice_total = 0;
1297 for (int s = 0; s < devinfo->max_slices; s++) {
1298 if (!intel_device_info_slice_available(devinfo, s))
1299 continue;
1300
1301 for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1302 devinfo->num_subslices[s] +=
1303 __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1304 }
1305 devinfo->subslice_total += devinfo->num_subslices[s];
1306 }
1307 assert(devinfo->num_slices > 0);
1308 assert(devinfo->subslice_total > 0);
1309 }
1310
1311 void
intel_device_info_update_pixel_pipes(struct intel_device_info * devinfo,uint8_t * subslice_masks)1312 intel_device_info_update_pixel_pipes(struct intel_device_info *devinfo, uint8_t *subslice_masks)
1313 {
1314 if (devinfo->ver < 11)
1315 return;
1316
1317 /* The kernel only reports one slice on all existing ICL+ platforms, even
1318 * if multiple slices are present. The slice mask is allowed to have the
1319 * accurate value greater than 1 on gfx12.5+ platforms though, in order to
1320 * be tolerant with the behavior of our simulation environment.
1321 */
1322 assert(devinfo->slice_masks == 1 || devinfo->verx10 >= 125);
1323
1324 /* Count the number of subslices on each pixel pipe. Assume that every
1325 * contiguous group of 4 subslices in the mask belong to the same pixel
1326 * pipe. However note that on TGL+ the kernel returns a mask of enabled
1327 * *dual* subslices instead of actual subslices somewhat confusingly, so
1328 * each pixel pipe only takes 2 bits in the mask even though it's still 4
1329 * subslices.
1330 */
1331 const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1332 for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1333 const unsigned offset = p * ppipe_bits;
1334 const unsigned subslice_idx = offset /
1335 devinfo->max_subslices_per_slice * devinfo->subslice_slice_stride;
1336 const unsigned ppipe_mask =
1337 BITFIELD_RANGE(offset % devinfo->max_subslices_per_slice, ppipe_bits);
1338
1339 if (subslice_idx < ARRAY_SIZE(devinfo->subslice_masks))
1340 devinfo->ppipe_subslices[p] =
1341 __builtin_popcount(subslice_masks[subslice_idx] & ppipe_mask);
1342 else
1343 devinfo->ppipe_subslices[p] = 0;
1344 }
1345 }
1346
1347 void
intel_device_info_update_l3_banks(struct intel_device_info * devinfo)1348 intel_device_info_update_l3_banks(struct intel_device_info *devinfo)
1349 {
1350 if (devinfo->ver != 12)
1351 return;
1352
1353 if (devinfo->verx10 >= 125) {
1354 if (devinfo->subslice_total > 16) {
1355 assert(devinfo->subslice_total <= 32);
1356 devinfo->l3_banks = 32;
1357 } else if (devinfo->subslice_total > 8) {
1358 devinfo->l3_banks = 16;
1359 } else {
1360 devinfo->l3_banks = 8;
1361 }
1362 } else {
1363 assert(devinfo->num_slices == 1);
1364 if (devinfo->subslice_total >= 6) {
1365 assert(devinfo->subslice_total == 6);
1366 devinfo->l3_banks = 8;
1367 } else if (devinfo->subslice_total > 2) {
1368 devinfo->l3_banks = 6;
1369 } else {
1370 devinfo->l3_banks = 4;
1371 }
1372 }
1373 }
1374
1375 /* Returns the number of EUs of the first subslice enabled */
1376 uint32_t
intel_device_info_get_eu_count_first_subslice(const struct intel_device_info * devinfo)1377 intel_device_info_get_eu_count_first_subslice(const struct intel_device_info *devinfo)
1378 {
1379 uint32_t first_subslice, first_slice, offset, i;
1380 uint32_t eu_count = 0;
1381
1382 first_slice = ffs(devinfo->slice_masks);
1383 first_slice--;
1384 offset = first_slice * devinfo->subslice_slice_stride;
1385
1386 for (i = 0; i < DIV_ROUND_UP(devinfo->max_subslices_per_slice, 8); i++) {
1387 first_subslice = ffs(devinfo->subslice_masks[offset + i]);
1388
1389 if (first_subslice == 0)
1390 continue;
1391
1392 break;
1393 }
1394
1395 assert(first_subslice > 0);
1396 first_subslice--;
1397 offset = first_slice * devinfo->eu_slice_stride +
1398 first_subslice * devinfo->eu_subslice_stride;
1399 for (i = 0; i < DIV_ROUND_UP(devinfo->max_eus_per_subslice, 8); i++)
1400 eu_count += __builtin_popcount(devinfo->eu_masks[offset + i]);
1401
1402 assert(eu_count > 0);
1403 return eu_count;
1404 }
1405
1406 /* Generate mask from the device data. */
1407 static void
fill_masks(struct intel_device_info * devinfo)1408 fill_masks(struct intel_device_info *devinfo)
1409 {
1410 /* All of our internal device descriptions assign the same number of
1411 * subslices for each slice. Just verify that this is true.
1412 */
1413 for (int s = 1; s < devinfo->num_slices; s++)
1414 assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1415
1416 intel_device_info_i915_update_from_masks(devinfo,
1417 (1U << devinfo->num_slices) - 1,
1418 (1U << devinfo->num_subslices[0]) - 1,
1419 devinfo->num_slices * devinfo->num_subslices[0] *
1420 devinfo->max_eus_per_subslice);
1421 }
1422
1423 void
intel_device_info_update_cs_workgroup_threads(struct intel_device_info * devinfo)1424 intel_device_info_update_cs_workgroup_threads(struct intel_device_info *devinfo)
1425 {
1426 /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1427 * can program is 64 without going up to a rectangular group. This only
1428 * impacts Haswell and TGL which have higher thread counts.
1429 *
1430 * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1431 * is 10 bits so we have no such restrictions.
1432 */
1433 devinfo->max_cs_workgroup_threads =
1434 devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1435 MIN2(devinfo->max_cs_threads, 64);
1436 }
1437
/* Interpret one comma-free element of INTEL_FORCE_PROBE.
 *
 * Accepted forms are an optional leading '!' followed by either '*' or a
 * hexadecimal PCI ID as parsed by strtol(..., 16), with nothing after it.
 *
 * Returns true when the entry is well formed and applies to @pci_id; only in
 * that case are the outputs written: *force_on for a plain entry, *force_off
 * for a '!'-negated one. Malformed or non-matching entries return false and
 * leave both outputs untouched.
 */
static bool
parse_force_probe_entry(int pci_id, const char *entry, bool *force_on,
                        bool *force_off)
{
   const bool negated = entry[0] == '!';
   const char *cursor = negated ? entry + 1 : entry;

   if (cursor[0] == '\0')
      return false;

   bool matched;
   if (cursor[0] == '*') {
      /* The wildcard must be the last character of the entry. */
      if (cursor[1] != '\0')
         return false;
      matched = true;
   } else {
      char *tail;
      const long parsed = strtol(cursor, &tail, 16);

      /* Reject entries with no digits or with trailing garbage. */
      if (tail == cursor || *tail != '\0')
         return false;
      matched = parsed == (long)pci_id;
   }

   if (!matched)
      return false;

   *force_on = !negated;
   *force_off = negated;
   return true;
}
1475
/* Evaluate the INTEL_FORCE_PROBE environment variable for @pci_id.
 *
 * The variable is a comma-separated list of entries understood by
 * parse_force_probe_entry(). Both outputs start out false; every matching
 * entry overwrites them, so the last matching entry wins. If the variable is
 * unset or empty, or the working copy cannot be allocated, both outputs stay
 * false.
 *
 * The list is split by hand rather than with strtok(): strtok() keeps its
 * position in hidden global state, so it is not thread-safe and would also
 * disturb a tokenization the application has in progress.
 */
static void
scan_for_force_probe(int pci_id, bool *force_on, bool *force_off)
{
   *force_on = false;
   *force_off = false;

   const char *env = getenv("INTEL_FORCE_PROBE");
   if (env == NULL)
      return;

   size_t len = strlen(env);
   if (len == 0)
      return;

   /* Work on a private copy: entries are NUL-terminated in place. */
   char *dup = strndup(env, len);
   if (dup == NULL)
      return;

   char *entry = dup;
   while (entry != NULL) {
      char *comma = strchr(entry, ',');
      if (comma != NULL)
         *comma = '\0';

      /* Skip empty entries such as those produced by ",," or a trailing ','. */
      if (*entry != '\0')
         parse_force_probe_entry(pci_id, entry, force_on, force_off);

      entry = comma != NULL ? comma + 1 : NULL;
   }

   free(dup);
   assert(!*force_on || !*force_off);
}
1500
/* Per-PCI-ID options gathered from the trailing arguments of a CHIPSET()
 * entry while intel_device_info_init_common() looks the ID up.
 */
struct device_init_config {
   /* When true, the device is only initialized if probing is forced. */
   bool require_force_probe;
};
1504
/* Designated initializer for struct device_init_config, meant to be passed
 * as a trailing argument of a CHIPSET() entry.
 *
 * Example PCI ID entry using FORCE_PROBE:
 *
 * CHIPSET(0x1234, foo, "FOO", "Intel(R) Graphics", FORCE_PROBE)
 */
#define FORCE_PROBE .require_force_probe = true
1510
/* First, kernel-independent stage of device-info initialization.
 *
 * Looks @pci_id up in the PCI ID tables, copies the matching static template
 * into *@devinfo, fills in the device name, applies the force-probe policy
 * and then derives the fields that depend only on the template.
 *
 * @building selects build-time use: probing is treated as forced and
 * INTEL_FORCE_PROBE is not consulted.
 *
 * Returns false when the PCI ID is unknown (*@devinfo untouched), when the
 * device was disabled through INTEL_FORCE_PROBE, or when it requires a forced
 * probe that was not requested. In the latter two cases *@devinfo has already
 * been overwritten with the template and name but is not fully initialized.
 */
static bool
intel_device_info_init_common(int pci_id, bool building,
                              struct intel_device_info *devinfo)
{
   struct device_init_config device_config = { 0 };

   /* Expand every PCI ID table into case labels that copy the matching
    * template. The optional trailing CHIPSET() arguments initialize
    * device_config.
    */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(id, family, fam_str, name, ...) \
      case id: \
         *devinfo = intel_device_info_##family; \
         device_config = *&(struct device_init_config) { __VA_ARGS__ }; \
         break;
#include "pci_ids/crocus_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"

   /* Entries of the i915 table carry no family and all map to the gfx3
    * template.
    */
#undef CHIPSET
#define CHIPSET(id, fam_str, name) \
      case id: *devinfo = intel_device_info_gfx3; break;
#include "pci_ids/i915_pci_ids.h"

   default:
      mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
      return false;
   }

   /* Second pass over the crocus and iris tables to build the human-readable
    * name. IDs that only appear in the i915 table reach the default label.
    * The static assertion guarantees the concatenated string, including its
    * terminator, fits in devinfo->name.
    */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(_id, _family, _fam_str, _name, ...) \
   case _id: \
      /* sizeof(str_literal) includes the null */ \
      STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
                    sizeof(devinfo->name)); \
      strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
      break;
#include "pci_ids/crocus_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"
   default:
      strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
   }

   /* Force-probe policy: build-time callers always count as forced, runtime
    * callers are governed by INTEL_FORCE_PROBE.
    */
   bool force_on = false;
   bool force_off = false;
   if (building)
      force_on = true;
   else
      scan_for_force_probe(pci_id, &force_on, &force_off);
   devinfo->probe_forced = force_on;
   if (force_off) {
      mesa_logw("%s (0x%x) disabled with INTEL_FORCE_PROBE", devinfo->name,
                pci_id);
      return false;
   } else if (device_config.require_force_probe) {
      if (force_on) {
         if (!building)
            mesa_logw("Forcing probe of unsupported: %s (0x%x)", devinfo->name,
                      pci_id);
      } else {
         mesa_loge("%s (0x%x) requires INTEL_FORCE_PROBE", devinfo->name,
                   pci_id);
         return false;
      }
   }

   devinfo->pci_device_id = pci_id;

   /* Synthesize topology masks from the template's static counts. */
   fill_masks(devinfo);

   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW must
    *     allocate scratch space enough so that each slice has 4 slices allowed."
    *
    * The equivalent internal documentation says that this programming note
    * applies to all Gfx9+ platforms.
    *
    * The hardware typically calculates the scratch space pointer by taking
    * the base address, and adding per-thread-scratch-space * thread ID.
    * Extra padding can be necessary depending how the thread IDs are
    * calculated for a particular shader stage.
    */

   switch(devinfo->ver) {
   case 9:
      devinfo->max_wm_threads = 64 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 4; /* effective subslices per slice */
      break;
   case 11:
   case 12:
   case 20:
   case 30:
      devinfo->max_wm_threads = 128 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 8; /* subslices per slice */
      break;
   default:
      /* Older generations keep the max_wm_threads value from the template. */
      assert(devinfo->ver < 9);
      break;
   }

   assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));

   /* Templates that do not set verx10 get it derived from ver. */
   if (devinfo->verx10 == 0)
      devinfo->verx10 = devinfo->ver * 10;

   uint16_t major = devinfo->ver;
   uint16_t minor = (devinfo->verx10 - (devinfo->ver * 10)) * 10;
   /* When supported gfx_ip_ver will be overwritten by values read from KMD.
    * This is an approximation for platforms that do not support GMD ID or
    * when running offline tools.
    * verx10 125 becomes GFX_IP_VER(12, 50) for example.
    */
   devinfo->gfx_ip_ver = GFX_IP_VER(major, minor);

   if (devinfo->has_mesh_shading) {
      /* Half of push constant space matches the size used in the simplest
       * primitive pipeline (VS + FS). Tweaking this affects performance.
       */
      devinfo->mesh_max_constant_urb_size_kb =
         devinfo->max_constant_urb_size_kb / 2;
   }

   /*
    * Gfx 12.5 moved scratch to a surface and SURFTYPE_SCRATCH has this pitch
    * restriction:
    *
    * BSpec 43862 (r52666)
    * RENDER_SURFACE_STATE::Surface Pitch
    *    For surfaces of type SURFTYPE_SCRATCH, valid range of pitch is:
    *    [63,262143] -> [64B, 256KB]
    *
    * The pitch of the surface is the scratch size per thread and the surface
    * should be large enough to accommodate every physical thread.
    */
   devinfo->max_scratch_size_per_thread = devinfo->verx10 >= 125 ?
                                          (256 * 1024) : (2 * 1024 * 1024);
   intel_device_info_update_cs_workgroup_threads(devinfo);

   return true;
}
1651
1652 static void
intel_device_info_apply_workarounds(struct intel_device_info * devinfo)1653 intel_device_info_apply_workarounds(struct intel_device_info *devinfo)
1654 {
1655 if (intel_needs_workaround(devinfo, 18012660806))
1656 devinfo->urb.max_entries[MESA_SHADER_GEOMETRY] = 1536;
1657
1658 if (intel_needs_workaround(devinfo, 18040209780))
1659 devinfo->max_gs_threads = 312;
1660
1661 /* Fixes issues with:
1662 * dEQP-GLES31.functional.geometry_shading.layered.render_with_default_layer_cubemap
1663 * when running on GFX12 platforms with small EU count.
1664 */
1665 const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1666 if (devinfo->verx10 == 120 && eu_total <= 32)
1667 devinfo->urb.max_entries[MESA_SHADER_GEOMETRY] = 1024;
1668 }
1669
1670 static bool
intel_get_device_info_from_pci_id_common(int pci_id,bool building,struct intel_device_info * devinfo)1671 intel_get_device_info_from_pci_id_common(int pci_id, bool building,
1672 struct intel_device_info *devinfo)
1673 {
1674 intel_device_info_init_common(pci_id, building, devinfo);
1675
1676 /* This is a placeholder until a proper value is set. */
1677 devinfo->kmd_type = INTEL_KMD_TYPE_I915;
1678
1679 intel_device_info_init_was(devinfo);
1680 intel_device_info_apply_workarounds(devinfo);
1681
1682 return true;
1683 }
1684
/* Fill *@devinfo from @pci_id for runtime use: the common helper is called
 * with building == false, so INTEL_FORCE_PROBE is consulted. Returns the
 * helper's result.
 */
bool
intel_get_device_info_from_pci_id(int pci_id,
                                  struct intel_device_info *devinfo)
{
   return intel_get_device_info_from_pci_id_common(pci_id, false, devinfo);
}
1691
/* Fill *@devinfo from @pci_id for build-time use: the common helper is called
 * with building == true, which treats probing as forced without reading
 * INTEL_FORCE_PROBE. Returns the helper's result.
 */
bool
intel_get_device_info_for_build(int pci_id,
                                struct intel_device_info *devinfo)
{
   return intel_get_device_info_from_pci_id_common(pci_id, true, devinfo);
}
1698
1699 bool
intel_device_info_compute_system_memory(struct intel_device_info * devinfo,bool update)1700 intel_device_info_compute_system_memory(struct intel_device_info *devinfo, bool update)
1701 {
1702 if (!update) {
1703 if (!os_get_total_physical_memory(&devinfo->mem.sram.mappable.size))
1704 return false;
1705 }
1706
1707 os_get_available_system_memory(&devinfo->mem.sram.mappable.free);
1708
1709 return true;
1710 }
1711
1712 static void
intel_device_info_adjust_memory(struct intel_device_info * devinfo)1713 intel_device_info_adjust_memory(struct intel_device_info *devinfo)
1714 {
1715 uint64_t available;
1716
1717 /* Applications running without elevated privileges don't report valid
1718 * numbers for free sram
1719 */
1720 if (os_get_available_system_memory(&available)) {
1721 devinfo->mem.sram.mappable.free = MIN3(devinfo->mem.sram.mappable.free,
1722 devinfo->mem.sram.mappable.size,
1723 available);
1724 }
1725 }
1726
/* Fill devinfo->max_scratch_ids[] with the number of scratch slots needed per
 * shader stage.
 *
 * A compute-style thread-ID count is computed as
 * (scratch ids per subslice) * (subslices). On verx10 >= 125 that count is
 * used for every stage; on older hardware only the compute stage uses it and
 * the other stages use their max_*_threads limits.
 */
static void
init_max_scratch_ids(struct intel_device_info *devinfo)
{
   /* Determine the max number of subslices that potentially might be used in
    * scratch space ids.
    *
    * For Gfx11+, scratch space allocation is based on the number of threads
    * in the base configuration.
    *
    * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
    * we wish to view that there are 4 subslices per slice instead of the
    * actual number of subslices per slice. The documentation for 3DSTATE_PS
    * "Scratch Space Base Pointer" says:
    *
    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
    *     must allocate scratch space enough so that each slice has 4
    *     slices allowed."
    *
    * According to the other driver team, this applies to compute shaders
    * as well.  This is not currently documented at all.
    *
    * For Gfx8 and older we use devinfo->subslice_total.
    */
   unsigned subslices;
   if (devinfo->verx10 == 125)
      subslices = 32;
   else if (devinfo->ver == 12)
      subslices = (devinfo->platform == INTEL_PLATFORM_DG1 || devinfo->gt == 2 ? 6 : 2);
   else if (devinfo->ver == 11)
      subslices = 8;
   else if (devinfo->ver >= 9 && devinfo->ver < 11)
      subslices = 4 * devinfo->num_slices;
   else
      /* Also reached for ver > 12, which none of the tests above match. */
      subslices = devinfo->subslice_total;
   assert(subslices >= devinfo->subslice_total);

   unsigned scratch_ids_per_subslice;
   if (devinfo->ver >= 12) {
      /* Same as ICL below, but with 16 EUs. */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->ver >= 11) {
      /* The MEDIA_VFE_STATE docs say:
       *
       *    "Starting with this configuration, the Maximum Number of
       *     Threads must be set to (#EU * 8) for GPGPU dispatches.
       *
       *     Although there are only 7 threads per EU in the configuration,
       *     the FFTID is calculated as if there are 8 threads per EU,
       *     which in turn requires a larger amount of Scratch Space to be
       *     allocated by the driver."
       */
      scratch_ids_per_subslice = 8 * 8;
   } else if (devinfo->platform == INTEL_PLATFORM_HSW) {
      /* WaCSScratchSize:hsw
       *
       * Haswell's scratch space address calculation appears to be sparse
       * rather than tightly packed. The Thread ID has bits indicating
       * which subslice, EU within a subslice, and thread within an EU it
       * is. There's a maximum of two slices and two subslices, so these
       * can be stored with a single bit. Even though there are only 10 EUs
       * per subslice, this is stored in 4 bits, so there's an effective
       * maximum value of 16 EUs. Similarly, although there are only 7
       * threads per EU, this is stored in a 3 bit number, giving an
       * effective maximum value of 8 threads per EU.
       *
       * This means that we need to use 16 * 8 instead of 10 * 7 for the
       * number of threads per subslice.
       */
      scratch_ids_per_subslice = 16 * 8;
   } else if (devinfo->platform == INTEL_PLATFORM_CHV) {
      /* Cherryview devices have either 6 or 8 EUs per subslice, and each
       * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
       * as if it had 8 EUs.
       */
      scratch_ids_per_subslice = 8 * 7;
   } else {
      scratch_ids_per_subslice = devinfo->max_cs_threads;
   }

   unsigned max_thread_ids = scratch_ids_per_subslice * subslices;

   if (devinfo->verx10 >= 125) {
      /* On GFX version 12.5, scratch access changed to a surface-based model.
       * Instead of each shader type having its own layout based on IDs passed
       * from the relevant fixed-function unit, all scratch access is based on
       * thread IDs like it always has been for compute.
       */
      for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
         devinfo->max_scratch_ids[i] = max_thread_ids;
   } else {
      unsigned max_scratch_ids[] = {
         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
         [MESA_SHADER_COMPUTE]   = max_thread_ids,
      };
      /* The local table must cover exactly the stages devinfo stores. */
      STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
      memcpy(devinfo->max_scratch_ids, max_scratch_ids,
             sizeof(devinfo->max_scratch_ids));
   }
}
1830
1831 static unsigned
intel_device_info_calc_engine_prefetch(const struct intel_device_info * devinfo,enum intel_engine_class engine_class)1832 intel_device_info_calc_engine_prefetch(const struct intel_device_info *devinfo,
1833 enum intel_engine_class engine_class)
1834 {
1835 if (devinfo->verx10 >= 200) {
1836 switch (engine_class) {
1837 case INTEL_ENGINE_CLASS_RENDER:
1838 return 4096;
1839 case INTEL_ENGINE_CLASS_COMPUTE:
1840 return 1024;
1841 default:
1842 return 512;
1843 }
1844 }
1845
1846 if (intel_device_info_is_mtl_or_arl(devinfo)) {
1847 switch (engine_class) {
1848 case INTEL_ENGINE_CLASS_RENDER:
1849 return 2048;
1850 case INTEL_ENGINE_CLASS_COMPUTE:
1851 return 1024;
1852 default:
1853 return 512;
1854 }
1855 }
1856
1857 /* DG2 */
1858 if (devinfo->verx10 == 125)
1859 return 1024;
1860
1861 /* Older than DG2/MTL */
1862 return 512;
1863 }
1864
/* Fill *@devinfo for the DRM device open on @fd.
 *
 * @min_ver / @max_ver bound the accepted devinfo->ver; a bound that is not
 * greater than zero is ignored.
 *
 * Returns false when the DRM device cannot be queried, the PCI ID is rejected
 * by intel_device_info_init_common(), the version is out of bounds, the
 * kernel driver type is unknown, the kernel query fails, or local memory is
 * expected but no region information was obtained.
 */
bool
intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo, int min_ver, int max_ver)
{
   if (NULL != getenv("INTEL_STUB_GPU_JSON")) {
      /* This call will succeed when shim-drm has been initialized with a
       * serialized intel_device_info structure.
       */
      struct drm_intel_stub_devinfo arg = {
         .addr = (uintptr_t)devinfo,
         .size = sizeof(*devinfo),
      };
      if (0 == intel_ioctl(fd, DRM_IOCTL_INTEL_STUB_DEVINFO, &arg)) {
         intel_device_info_init_was(devinfo);
         intel_device_info_apply_workarounds(devinfo);
         return true;
      }
      /* Otherwise fall through to the regular probing path. */
   }

   /* Get PCI info.
    *
    * Some callers may already have a valid drm device which holds values of
    * PCI fields queried here prior to calling this function. But making this
    * query optional leads to a more cumbersome implementation. These callers
    * still need to initialize the fields somewhere out of this function and
    * rely on an ioctl to get PCI device id for the next step when skipping
    * this drm query.
    */
   drmDevicePtr drmdev = NULL;
   if (drmGetDevice2(fd, DRM_DEVICE_GET_PCI_REVISION, &drmdev)) {
      mesa_loge("Failed to query drm device.");
      return false;
   }
   if (!intel_device_info_init_common(drmdev->deviceinfo.pci->device_id,
                                      false, devinfo)) {
      drmFreeDevice(&drmdev);
      return false;
   }

   /* Reject devices outside the version range requested by the caller. */
   if ((min_ver > 0 && devinfo->ver < min_ver) || (max_ver > 0 && devinfo->ver > max_ver)) {
      drmFreeDevice(&drmdev);
      return false;
   }

   /* Copy the PCI location and identity before releasing the drm device. */
   devinfo->pci_domain = drmdev->businfo.pci->domain;
   devinfo->pci_bus = drmdev->businfo.pci->bus;
   devinfo->pci_dev = drmdev->businfo.pci->dev;
   devinfo->pci_func = drmdev->businfo.pci->func;
   devinfo->pci_device_id = drmdev->deviceinfo.pci->device_id;
   devinfo->pci_revision_id = drmdev->deviceinfo.pci->revision_id;
   drmFreeDevice(&drmdev);
   devinfo->no_hw = debug_get_bool_option("INTEL_NO_HW", false);

   devinfo->kmd_type = intel_get_kmd_type(fd);
   if (devinfo->kmd_type == INTEL_KMD_TYPE_INVALID) {
      mesa_loge("Unknown kernel mode driver");
      return false;
   }

   /* remaining initialization queries the kernel for device info */
   if (devinfo->no_hw) {
      /* Provide some sensible values for NO_HW. */
      devinfo->gtt_size =
         devinfo->ver >= 8 ? (1ull << 48) : 2ull * 1024 * 1024 * 1024;
      /* The result of the memory query is not checked on this path. */
      intel_device_info_compute_system_memory(devinfo, false);
      return true;
   }

   bool ret;
   switch (devinfo->kmd_type) {
   case INTEL_KMD_TYPE_I915:
      ret = intel_device_info_i915_get_info_from_fd(fd, devinfo);
      break;
   case INTEL_KMD_TYPE_XE:
      ret = intel_device_info_xe_get_info_from_fd(fd, devinfo);
      if (devinfo->verx10 < 200)
         mesa_logw("Support for this platform is experimental with Xe KMD, bug reports may be ignored.");
      break;
   default:
      ret = false;
      unreachable("Missing");
   }
   if (!ret) {
      mesa_logw("Could not get intel_device_info.");
      return false;
   }

   /* region info is required for lmem support */
   if (devinfo->has_local_mem && !devinfo->mem.use_class_instance) {
      mesa_logw("Could not query local memory size.");
      return false;
   }

   intel_device_info_adjust_memory(devinfo);

   /* Gfx7 and older do not support EU/Subslice info */
   assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
   devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);

   init_max_scratch_ids(devinfo);

   /* Compute the prefetch value of every engine class slot. */
   for (enum intel_engine_class engine = INTEL_ENGINE_CLASS_RENDER;
        engine < ARRAY_SIZE(devinfo->engine_class_prefetch); engine++)
      devinfo->engine_class_prefetch[engine] =
            intel_device_info_calc_engine_prefetch(devinfo, engine);

   intel_device_info_init_was(devinfo);
   intel_device_info_apply_workarounds(devinfo);

   intel_check_hwconfig_items(fd, devinfo);

   return true;
}
1977
intel_device_info_update_memory_info(struct intel_device_info * devinfo,int fd)1978 bool intel_device_info_update_memory_info(struct intel_device_info *devinfo, int fd)
1979 {
1980 bool ret;
1981
1982 switch (devinfo->kmd_type) {
1983 case INTEL_KMD_TYPE_I915:
1984 ret = intel_device_info_i915_query_regions(devinfo, fd, true);
1985 break;
1986 case INTEL_KMD_TYPE_XE:
1987 ret = intel_device_info_xe_query_regions(fd, devinfo, true);
1988 break;
1989 default:
1990 ret = false;
1991 }
1992
1993 if (ret)
1994 intel_device_info_adjust_memory(devinfo);
1995 return ret;
1996 }
1997
/* Recompute the fields of @devinfo that are derived from values the hwconfig
 * table may have changed: max_cs_threads becomes EUs-per-subslice times
 * threads-per-EU, and the workgroup thread limit is refreshed from it.
 */
void
intel_device_info_update_after_hwconfig(struct intel_device_info *devinfo)
{
   /* After applying hwconfig values, some items need to be recalculated. */
   devinfo->max_cs_threads =
      devinfo->max_eus_per_subslice * devinfo->num_thread_per_eu;

   intel_device_info_update_cs_workgroup_threads(devinfo);
}
2007
2008 enum intel_wa_steppings
intel_device_info_wa_stepping(struct intel_device_info * devinfo)2009 intel_device_info_wa_stepping(struct intel_device_info *devinfo)
2010 {
2011 /* When adding platforms to this function, check to see if
2012 * stepping-specific workarounds impact the compiler.
2013 *
2014 * If a stepping specific compiler workaround is required on a released
2015 * platform, intel_device_info->revision must be added as a
2016 * 'compiler_field' in intel_device_info.py
2017 */
2018
2019 if (devinfo->platform == INTEL_PLATFORM_BMG) {
2020 switch (devinfo->revision) {
2021 case 0:
2022 return INTEL_STEPPING_A0;
2023 case 1:
2024 return INTEL_STEPPING_A1;
2025 case 4:
2026 return INTEL_STEPPING_B0;
2027 default:
2028 return INTEL_STEPPING_RELEASE;
2029 }
2030 } else if (devinfo->platform == INTEL_PLATFORM_LNL) {
2031 switch (devinfo->revision) {
2032 case 0:
2033 return INTEL_STEPPING_A0;
2034 case 1:
2035 return INTEL_STEPPING_A1;
2036 case 4:
2037 return INTEL_STEPPING_B0;
2038 default:
2039 return INTEL_STEPPING_RELEASE;
2040 }
2041 } else if (devinfo->platform == INTEL_PLATFORM_TGL) {
2042 /* TGL production steppings: B0 and C0 */
2043 switch (devinfo->revision) {
2044 case 1:
2045 return INTEL_STEPPING_B0;
2046 case 3:
2047 return INTEL_STEPPING_C0;
2048 default:
2049 return INTEL_STEPPING_RELEASE;
2050 }
2051 }
2052
2053 /* all other platforms support only released steppings */
2054 return INTEL_STEPPING_RELEASE;
2055 }
2056
2057 uint32_t
intel_device_info_get_max_slm_size(const struct intel_device_info * devinfo)2058 intel_device_info_get_max_slm_size(const struct intel_device_info *devinfo)
2059 {
2060 uint32_t bytes = 0;
2061
2062 if (devinfo->verx10 >= 300) {
2063 bytes = 128 * 1024;
2064 } else if (devinfo->verx10 >= 200) {
2065 bytes = intel_device_info_get_max_preferred_slm_size(devinfo);
2066 } else {
2067 bytes = 64 * 1024;
2068 }
2069
2070 return bytes;
2071 }
2072
2073 uint32_t
intel_device_info_get_max_preferred_slm_size(const struct intel_device_info * devinfo)2074 intel_device_info_get_max_preferred_slm_size(const struct intel_device_info *devinfo)
2075 {
2076 uint32_t k_bytes = 0;
2077
2078 if (devinfo->verx10 >= 300) {
2079 k_bytes = 192;
2080 } else if (devinfo->verx10 >= 200) {
2081 if (intel_needs_workaround(devinfo, 16018610683))
2082 k_bytes = 128;
2083 else
2084 k_bytes = 160;
2085 } else {
2086 k_bytes = 128;
2087 }
2088
2089 return k_bytes * 1024;
2090 }
2091