1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
#include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30
31 #include <xf86drm.h>
32
33 #include "intel_device_info.h"
34 #include "intel_hwconfig.h"
35 #include "intel/common/intel_gem.h"
36 #include "util/bitscan.h"
37 #include "util/debug.h"
38 #include "util/log.h"
39 #include "util/macros.h"
40 #include "util/os_misc.h"
41
42 #include "drm-uapi/i915_drm.h"
43
44 static const struct {
45 const char *name;
46 int pci_id;
47 } name_map[] = {
48 { "lpt", 0x27a2 },
49 { "brw", 0x2a02 },
50 { "g4x", 0x2a42 },
51 { "ilk", 0x0042 },
52 { "snb", 0x0126 },
53 { "ivb", 0x016a },
54 { "hsw", 0x0d2e },
55 { "byt", 0x0f33 },
56 { "bdw", 0x162e },
57 { "chv", 0x22B3 },
58 { "skl", 0x1912 },
59 { "bxt", 0x5A85 },
60 { "kbl", 0x5912 },
61 { "aml", 0x591C },
62 { "glk", 0x3185 },
63 { "cfl", 0x3E9B },
64 { "whl", 0x3EA1 },
65 { "cml", 0x9b41 },
66 { "icl", 0x8a52 },
67 { "ehl", 0x4500 },
68 { "jsl", 0x4E71 },
69 { "tgl", 0x9a49 },
70 { "rkl", 0x4c8a },
71 { "dg1", 0x4905 },
72 { "adl", 0x4680 },
73 { "sg1", 0x4907 },
74 { "rpl", 0xa780 },
75 { "dg2", 0x5690 },
76 };
77
78 /**
79 * Get the PCI ID for the device name.
80 *
81 * Returns -1 if the device is not known.
82 */
83 int
84 intel_device_name_to_pci_device_id(const char *name)
85 {
86 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
87 if (!strcmp(name_map[i].name, name))
88 return name_map[i].pci_id;
89 }
90
91 return -1;
92 }
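/*
 * Example (illustrative only, not taken from this file): a device-ID override
 * handler could accept either a short platform name from the table above or a
 * raw hex PCI ID:
 *
 *    int devid = intel_device_name_to_pci_device_id(override);
 *    if (devid < 0)
 *       devid = strtol(override, NULL, 0);
 */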
93
94 static const struct intel_device_info intel_device_info_gfx3 = {
95 .ver = 3,
96 .platform = INTEL_PLATFORM_GFX3,
97 .simulator_id = -1,
98 .num_slices = 1,
99 .num_subslices = { 1, },
100 .max_eus_per_subslice = 8,
101 .num_thread_per_eu = 4,
102 .timestamp_frequency = 12500000,
103 .cs_prefetch_size = 512,
104 };
105
106 static const struct intel_device_info intel_device_info_i965 = {
107 .ver = 4,
108 .platform = INTEL_PLATFORM_I965,
109 .has_negative_rhw_bug = true,
110 .num_slices = 1,
111 .num_subslices = { 1, },
112 .max_eus_per_subslice = 8,
113 .num_thread_per_eu = 4,
114 .max_vs_threads = 16,
115 .max_gs_threads = 2,
116 .max_wm_threads = 8 * 4,
117 .urb = {
118 .size = 256,
119 },
120 .timestamp_frequency = 12500000,
121 .simulator_id = -1,
122 .cs_prefetch_size = 512,
123 };
124
125 static const struct intel_device_info intel_device_info_g4x = {
126 .ver = 4,
127 .verx10 = 45,
128 .has_pln = true,
129 .has_compr4 = true,
130 .has_surface_tile_offset = true,
131 .platform = INTEL_PLATFORM_G4X,
132 .num_slices = 1,
133 .num_subslices = { 1, },
134 .max_eus_per_subslice = 10,
135 .num_thread_per_eu = 5,
136 .max_vs_threads = 32,
137 .max_gs_threads = 2,
138 .max_wm_threads = 10 * 5,
139 .urb = {
140 .size = 384,
141 },
142 .timestamp_frequency = 12500000,
143 .simulator_id = -1,
144 .cs_prefetch_size = 512,
145 };
146
147 static const struct intel_device_info intel_device_info_ilk = {
148 .ver = 5,
149 .platform = INTEL_PLATFORM_ILK,
150 .has_pln = true,
151 .has_compr4 = true,
152 .has_surface_tile_offset = true,
153 .num_slices = 1,
154 .num_subslices = { 1, },
155 .max_eus_per_subslice = 12,
156 .num_thread_per_eu = 6,
157 .max_vs_threads = 72,
158 .max_gs_threads = 32,
159 .max_wm_threads = 12 * 6,
160 .urb = {
161 .size = 1024,
162 },
163 .timestamp_frequency = 12500000,
164 .simulator_id = -1,
165 .cs_prefetch_size = 512,
166 };
167
168 static const struct intel_device_info intel_device_info_snb_gt1 = {
169 .ver = 6,
170 .gt = 1,
171 .platform = INTEL_PLATFORM_SNB,
172 .has_hiz_and_separate_stencil = true,
173 .has_llc = true,
174 .has_pln = true,
175 .has_surface_tile_offset = true,
176 .needs_unlit_centroid_workaround = true,
177 .num_slices = 1,
178 .num_subslices = { 1, },
179 .max_eus_per_subslice = 6,
180 .num_thread_per_eu = 6, /* Not confirmed */
181 .max_vs_threads = 24,
182 .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
183 .max_wm_threads = 40,
184 .urb = {
185 .size = 32,
186 .min_entries = {
187 [MESA_SHADER_VERTEX] = 24,
188 },
189 .max_entries = {
190 [MESA_SHADER_VERTEX] = 256,
191 [MESA_SHADER_GEOMETRY] = 256,
192 },
193 },
194 .timestamp_frequency = 12500000,
195 .simulator_id = -1,
196 .cs_prefetch_size = 512,
197 };
198
199 static const struct intel_device_info intel_device_info_snb_gt2 = {
200 .ver = 6,
201 .gt = 2,
202 .platform = INTEL_PLATFORM_SNB,
203 .has_hiz_and_separate_stencil = true,
204 .has_llc = true,
205 .has_pln = true,
206 .has_surface_tile_offset = true,
207 .needs_unlit_centroid_workaround = true,
208 .num_slices = 1,
209 .num_subslices = { 1, },
210 .max_eus_per_subslice = 12,
211 .num_thread_per_eu = 6, /* Not confirmed */
212 .max_vs_threads = 60,
213 .max_gs_threads = 60,
214 .max_wm_threads = 80,
215 .urb = {
216 .size = 64,
217 .min_entries = {
218 [MESA_SHADER_VERTEX] = 24,
219 },
220 .max_entries = {
221 [MESA_SHADER_VERTEX] = 256,
222 [MESA_SHADER_GEOMETRY] = 256,
223 },
224 },
225 .timestamp_frequency = 12500000,
226 .simulator_id = -1,
227 .cs_prefetch_size = 512,
228 };
229
230 #define GFX7_FEATURES \
231 .ver = 7, \
232 .has_hiz_and_separate_stencil = true, \
233 .must_use_separate_stencil = true, \
234 .has_llc = true, \
235 .has_pln = true, \
236 .has_64bit_float = true, \
237 .has_surface_tile_offset = true, \
238 .timestamp_frequency = 12500000, \
239 .max_constant_urb_size_kb = 16, \
240 .cs_prefetch_size = 512
241
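/* These FEATURES macros expand to designated initializers. When a later
 * initializer names the same field again (e.g. BYT setting .has_llc = false
 * after GFX7_FEATURES supplied true), the last one wins, which is how the
 * per-platform entries below specialize the shared defaults.
 */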
242 static const struct intel_device_info intel_device_info_ivb_gt1 = {
243 GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
244 .num_slices = 1,
245 .num_subslices = { 1, },
246 .max_eus_per_subslice = 6,
247 .num_thread_per_eu = 6,
248 .l3_banks = 2,
249 .max_vs_threads = 36,
250 .max_tcs_threads = 36,
251 .max_tes_threads = 36,
252 .max_gs_threads = 36,
253 .max_wm_threads = 48,
254 .max_cs_threads = 36,
255 .urb = {
256 .min_entries = {
257 [MESA_SHADER_VERTEX] = 32,
258 [MESA_SHADER_TESS_EVAL] = 10,
259 },
260 .max_entries = {
261 [MESA_SHADER_VERTEX] = 512,
262 [MESA_SHADER_TESS_CTRL] = 32,
263 [MESA_SHADER_TESS_EVAL] = 288,
264 [MESA_SHADER_GEOMETRY] = 192,
265 },
266 },
267 .simulator_id = 7,
268 };
269
270 static const struct intel_device_info intel_device_info_ivb_gt2 = {
271 GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 2,
272 .num_slices = 1,
273 .num_subslices = { 1, },
274 .max_eus_per_subslice = 12,
275 .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
276 * @max_wm_threads ... */
277 .l3_banks = 4,
278 .max_vs_threads = 128,
279 .max_tcs_threads = 128,
280 .max_tes_threads = 128,
281 .max_gs_threads = 128,
282 .max_wm_threads = 172,
283 .max_cs_threads = 64,
284 .urb = {
285 .min_entries = {
286 [MESA_SHADER_VERTEX] = 32,
287 [MESA_SHADER_TESS_EVAL] = 10,
288 },
289 .max_entries = {
290 [MESA_SHADER_VERTEX] = 704,
291 [MESA_SHADER_TESS_CTRL] = 64,
292 [MESA_SHADER_TESS_EVAL] = 448,
293 [MESA_SHADER_GEOMETRY] = 320,
294 },
295 },
296 .simulator_id = 7,
297 };
298
299 static const struct intel_device_info intel_device_info_byt = {
300 GFX7_FEATURES, .platform = INTEL_PLATFORM_BYT, .gt = 1,
301 .num_slices = 1,
302 .num_subslices = { 1, },
303 .max_eus_per_subslice = 4,
304 .num_thread_per_eu = 8,
305 .l3_banks = 1,
306 .has_llc = false,
307 .max_vs_threads = 36,
308 .max_tcs_threads = 36,
309 .max_tes_threads = 36,
310 .max_gs_threads = 36,
311 .max_wm_threads = 48,
312 .max_cs_threads = 32,
313 .urb = {
314 .min_entries = {
315 [MESA_SHADER_VERTEX] = 32,
316 [MESA_SHADER_TESS_EVAL] = 10,
317 },
318 .max_entries = {
319 [MESA_SHADER_VERTEX] = 512,
320 [MESA_SHADER_TESS_CTRL] = 32,
321 [MESA_SHADER_TESS_EVAL] = 288,
322 [MESA_SHADER_GEOMETRY] = 192,
323 },
324 },
325 .simulator_id = 10,
326 };
327
328 #define HSW_FEATURES \
329 GFX7_FEATURES, \
330 .platform = INTEL_PLATFORM_HSW, \
331 .verx10 = 75, \
332 .supports_simd16_3src = true
333
334 static const struct intel_device_info intel_device_info_hsw_gt1 = {
335 HSW_FEATURES, .gt = 1,
336 .num_slices = 1,
337 .num_subslices = { 1, },
338 .max_eus_per_subslice = 10,
339 .num_thread_per_eu = 7,
340 .l3_banks = 2,
341 .max_vs_threads = 70,
342 .max_tcs_threads = 70,
343 .max_tes_threads = 70,
344 .max_gs_threads = 70,
345 .max_wm_threads = 102,
346 .max_cs_threads = 70,
347 .urb = {
348 .min_entries = {
349 [MESA_SHADER_VERTEX] = 32,
350 [MESA_SHADER_TESS_EVAL] = 10,
351 },
352 .max_entries = {
353 [MESA_SHADER_VERTEX] = 640,
354 [MESA_SHADER_TESS_CTRL] = 64,
355 [MESA_SHADER_TESS_EVAL] = 384,
356 [MESA_SHADER_GEOMETRY] = 256,
357 },
358 },
359 .simulator_id = 9,
360 };
361
362 static const struct intel_device_info intel_device_info_hsw_gt2 = {
363 HSW_FEATURES, .gt = 2,
364 .num_slices = 1,
365 .num_subslices = { 2, },
366 .max_eus_per_subslice = 10,
367 .num_thread_per_eu = 7,
368 .l3_banks = 4,
369 .max_vs_threads = 280,
370 .max_tcs_threads = 256,
371 .max_tes_threads = 280,
372 .max_gs_threads = 256,
373 .max_wm_threads = 204,
374 .max_cs_threads = 70,
375 .urb = {
376 .min_entries = {
377 [MESA_SHADER_VERTEX] = 64,
378 [MESA_SHADER_TESS_EVAL] = 10,
379 },
380 .max_entries = {
381 [MESA_SHADER_VERTEX] = 1664,
382 [MESA_SHADER_TESS_CTRL] = 128,
383 [MESA_SHADER_TESS_EVAL] = 960,
384 [MESA_SHADER_GEOMETRY] = 640,
385 },
386 },
387 .simulator_id = 9,
388 };
389
390 static const struct intel_device_info intel_device_info_hsw_gt3 = {
391 HSW_FEATURES, .gt = 3,
392 .num_slices = 2,
393 .num_subslices = { 2, 2, },
394 .max_eus_per_subslice = 10,
395 .num_thread_per_eu = 7,
396 .l3_banks = 8,
397 .max_vs_threads = 280,
398 .max_tcs_threads = 256,
399 .max_tes_threads = 280,
400 .max_gs_threads = 256,
401 .max_wm_threads = 408,
402 .max_cs_threads = 70,
403 .urb = {
404 .min_entries = {
405 [MESA_SHADER_VERTEX] = 64,
406 [MESA_SHADER_TESS_EVAL] = 10,
407 },
408 .max_entries = {
409 [MESA_SHADER_VERTEX] = 1664,
410 [MESA_SHADER_TESS_CTRL] = 128,
411 [MESA_SHADER_TESS_EVAL] = 960,
412 [MESA_SHADER_GEOMETRY] = 640,
413 },
414 },
415 .max_constant_urb_size_kb = 32,
416 .simulator_id = 9,
417 };
418
419 /* It's unclear how well supported sampling from the hiz buffer is on GFX8,
420 * so keep things conservative for now and set has_sample_with_hiz = false.
421 */
422 #define GFX8_FEATURES \
423 .ver = 8, \
424 .has_hiz_and_separate_stencil = true, \
425 .must_use_separate_stencil = true, \
426 .has_llc = true, \
427 .has_sample_with_hiz = false, \
428 .has_pln = true, \
429 .has_integer_dword_mul = true, \
430 .has_64bit_float = true, \
431 .has_64bit_int = true, \
432 .supports_simd16_3src = true, \
433 .has_surface_tile_offset = true, \
434 .num_thread_per_eu = 7, \
435 .max_vs_threads = 504, \
436 .max_tcs_threads = 504, \
437 .max_tes_threads = 504, \
438 .max_gs_threads = 504, \
439 .max_wm_threads = 384, \
440 .max_threads_per_psd = 64, \
441 .timestamp_frequency = 12500000, \
442 .max_constant_urb_size_kb = 32, \
443 .cs_prefetch_size = 512
444
445 static const struct intel_device_info intel_device_info_bdw_gt1 = {
446 GFX8_FEATURES, .gt = 1,
447 .platform = INTEL_PLATFORM_BDW,
448 .num_slices = 1,
449 .num_subslices = { 2, },
450 .max_eus_per_subslice = 6,
451 .l3_banks = 2,
452 .max_cs_threads = 42,
453 .urb = {
454 .min_entries = {
455 [MESA_SHADER_VERTEX] = 64,
456 [MESA_SHADER_TESS_EVAL] = 34,
457 },
458 .max_entries = {
459 [MESA_SHADER_VERTEX] = 2560,
460 [MESA_SHADER_TESS_CTRL] = 504,
461 [MESA_SHADER_TESS_EVAL] = 1536,
462 /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
463 [MESA_SHADER_GEOMETRY] = 690,
464 },
465 },
466 .simulator_id = 11,
467 };
468
469 static const struct intel_device_info intel_device_info_bdw_gt2 = {
470 GFX8_FEATURES, .gt = 2,
471 .platform = INTEL_PLATFORM_BDW,
472 .num_slices = 1,
473 .num_subslices = { 3, },
474 .max_eus_per_subslice = 8,
475 .l3_banks = 4,
476 .max_cs_threads = 56,
477 .urb = {
478 .min_entries = {
479 [MESA_SHADER_VERTEX] = 64,
480 [MESA_SHADER_TESS_EVAL] = 34,
481 },
482 .max_entries = {
483 [MESA_SHADER_VERTEX] = 2560,
484 [MESA_SHADER_TESS_CTRL] = 504,
485 [MESA_SHADER_TESS_EVAL] = 1536,
486 [MESA_SHADER_GEOMETRY] = 960,
487 },
488 },
489 .simulator_id = 11,
490 };
491
492 static const struct intel_device_info intel_device_info_bdw_gt3 = {
493 GFX8_FEATURES, .gt = 3,
494 .platform = INTEL_PLATFORM_BDW,
495 .num_slices = 2,
496 .num_subslices = { 3, 3, },
497 .max_eus_per_subslice = 8,
498 .l3_banks = 8,
499 .max_cs_threads = 56,
500 .urb = {
501 .min_entries = {
502 [MESA_SHADER_VERTEX] = 64,
503 [MESA_SHADER_TESS_EVAL] = 34,
504 },
505 .max_entries = {
506 [MESA_SHADER_VERTEX] = 2560,
507 [MESA_SHADER_TESS_CTRL] = 504,
508 [MESA_SHADER_TESS_EVAL] = 1536,
509 [MESA_SHADER_GEOMETRY] = 960,
510 },
511 },
512 .simulator_id = 11,
513 };
514
515 static const struct intel_device_info intel_device_info_chv = {
516 GFX8_FEATURES, .platform = INTEL_PLATFORM_CHV, .gt = 1,
517 .has_llc = false,
518 .has_integer_dword_mul = false,
519 .num_slices = 1,
520 .num_subslices = { 2, },
521 .max_eus_per_subslice = 8,
522 .l3_banks = 2,
523 .max_vs_threads = 80,
524 .max_tcs_threads = 80,
525 .max_tes_threads = 80,
526 .max_gs_threads = 80,
527 .max_wm_threads = 128,
528 .max_cs_threads = 6 * 7,
529 .urb = {
530 .min_entries = {
531 [MESA_SHADER_VERTEX] = 34,
532 [MESA_SHADER_TESS_EVAL] = 34,
533 },
534 .max_entries = {
535 [MESA_SHADER_VERTEX] = 640,
536 [MESA_SHADER_TESS_CTRL] = 80,
537 [MESA_SHADER_TESS_EVAL] = 384,
538 [MESA_SHADER_GEOMETRY] = 256,
539 },
540 },
541 .simulator_id = 13,
542 };
543
544 #define GFX9_HW_INFO \
545 .ver = 9, \
546 .max_vs_threads = 336, \
547 .max_gs_threads = 336, \
548 .max_tcs_threads = 336, \
549 .max_tes_threads = 336, \
550 .max_threads_per_psd = 64, \
551 .max_cs_threads = 56, \
552 .timestamp_frequency = 12000000, \
553 .cs_prefetch_size = 512, \
554 .urb = { \
555 .min_entries = { \
556 [MESA_SHADER_VERTEX] = 64, \
557 [MESA_SHADER_TESS_EVAL] = 34, \
558 }, \
559 .max_entries = { \
560 [MESA_SHADER_VERTEX] = 1856, \
561 [MESA_SHADER_TESS_CTRL] = 672, \
562 [MESA_SHADER_TESS_EVAL] = 1120, \
563 [MESA_SHADER_GEOMETRY] = 640, \
564 }, \
565 }
566
567 #define GFX9_LP_FEATURES \
568 GFX8_FEATURES, \
569 GFX9_HW_INFO, \
570 .has_integer_dword_mul = false, \
571 .gt = 1, \
572 .has_llc = false, \
573 .has_sample_with_hiz = true, \
574 .num_slices = 1, \
575 .num_thread_per_eu = 6, \
576 .max_vs_threads = 112, \
577 .max_tcs_threads = 112, \
578 .max_tes_threads = 112, \
579 .max_gs_threads = 112, \
580 .max_cs_threads = 6 * 6, \
581 .timestamp_frequency = 19200000, \
582 .urb = { \
583 .min_entries = { \
584 [MESA_SHADER_VERTEX] = 34, \
585 [MESA_SHADER_TESS_EVAL] = 34, \
586 }, \
587 .max_entries = { \
588 [MESA_SHADER_VERTEX] = 704, \
589 [MESA_SHADER_TESS_CTRL] = 256, \
590 [MESA_SHADER_TESS_EVAL] = 416, \
591 [MESA_SHADER_GEOMETRY] = 256, \
592 }, \
593 }
594
595 #define GFX9_LP_FEATURES_3X6 \
596 GFX9_LP_FEATURES, \
597 .num_subslices = { 3, }, \
598 .max_eus_per_subslice = 6
599
600 #define GFX9_LP_FEATURES_2X6 \
601 GFX9_LP_FEATURES, \
602 .num_subslices = { 2, }, \
603 .max_eus_per_subslice = 6, \
604 .max_vs_threads = 56, \
605 .max_tcs_threads = 56, \
606 .max_tes_threads = 56, \
607 .max_gs_threads = 56, \
608 .max_cs_threads = 6 * 6, \
609 .urb = { \
610 .min_entries = { \
611 [MESA_SHADER_VERTEX] = 34, \
612 [MESA_SHADER_TESS_EVAL] = 34, \
613 }, \
614 .max_entries = { \
615 [MESA_SHADER_VERTEX] = 352, \
616 [MESA_SHADER_TESS_CTRL] = 128, \
617 [MESA_SHADER_TESS_EVAL] = 208, \
618 [MESA_SHADER_GEOMETRY] = 128, \
619 }, \
620 }
621
622 #define GFX9_FEATURES \
623 GFX8_FEATURES, \
624 GFX9_HW_INFO, \
625 .has_sample_with_hiz = true
626
627 static const struct intel_device_info intel_device_info_skl_gt1 = {
628 GFX9_FEATURES, .gt = 1,
629 .platform = INTEL_PLATFORM_SKL,
630 .num_slices = 1,
631 .num_subslices = { 2, },
632 .max_eus_per_subslice = 6,
633 .l3_banks = 2,
634 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
635 * causing some vertices to go missing if we use too much URB.
636 */
637 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
638 .simulator_id = 12,
639 };
640
641 static const struct intel_device_info intel_device_info_skl_gt2 = {
642 GFX9_FEATURES, .gt = 2,
643 .platform = INTEL_PLATFORM_SKL,
644 .num_slices = 1,
645 .num_subslices = { 3, },
646 .max_eus_per_subslice = 8,
647 .l3_banks = 4,
648 .simulator_id = 12,
649 };
650
651 static const struct intel_device_info intel_device_info_skl_gt3 = {
652 GFX9_FEATURES, .gt = 3,
653 .platform = INTEL_PLATFORM_SKL,
654 .num_slices = 2,
655 .num_subslices = { 3, 3, },
656 .max_eus_per_subslice = 8,
657 .l3_banks = 8,
658 .simulator_id = 12,
659 };
660
661 static const struct intel_device_info intel_device_info_skl_gt4 = {
662 GFX9_FEATURES, .gt = 4,
663 .platform = INTEL_PLATFORM_SKL,
664 .num_slices = 3,
665 .num_subslices = { 3, 3, 3, },
666 .max_eus_per_subslice = 8,
667 .l3_banks = 12,
668 /* From the "L3 Allocation and Programming" documentation:
669 *
670 * "URB is limited to 1008KB due to programming restrictions. This is not a
671 * restriction of the L3 implementation, but of the FF and other clients.
672 * Therefore, in a GT4 implementation it is possible for the programmed
673 * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
674 * only 1008KB of this will be used."
675 */
676 .simulator_id = 12,
677 };
678
679 static const struct intel_device_info intel_device_info_bxt = {
680 GFX9_LP_FEATURES_3X6,
681 .platform = INTEL_PLATFORM_BXT,
682 .l3_banks = 2,
683 .simulator_id = 14,
684 };
685
686 static const struct intel_device_info intel_device_info_bxt_2x6 = {
687 GFX9_LP_FEATURES_2X6,
688 .platform = INTEL_PLATFORM_BXT,
689 .l3_banks = 1,
690 .simulator_id = 14,
691 };
692 /*
693 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
694 * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
695 */
696
697 static const struct intel_device_info intel_device_info_kbl_gt1 = {
698 GFX9_FEATURES,
699 .platform = INTEL_PLATFORM_KBL,
700 .gt = 1,
701
702 .max_cs_threads = 7 * 6,
703 .num_slices = 1,
704 .num_subslices = { 2, },
705 .max_eus_per_subslice = 6,
706 .l3_banks = 2,
707 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
708 * causing some vertices to go missing if we use too much URB.
709 */
710 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
711 .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
712 .simulator_id = 16,
713 };
714
715 static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
716 GFX9_FEATURES,
717 .platform = INTEL_PLATFORM_KBL,
718 .gt = 1,
719
720 .max_cs_threads = 7 * 6,
721 .num_slices = 1,
722 .num_subslices = { 3, },
723 .max_eus_per_subslice = 6,
724 .l3_banks = 4,
725 .simulator_id = 16,
726 };
727
728 static const struct intel_device_info intel_device_info_kbl_gt2 = {
729 GFX9_FEATURES,
730 .platform = INTEL_PLATFORM_KBL,
731 .gt = 2,
732
733 .num_slices = 1,
734 .num_subslices = { 3, },
735 .max_eus_per_subslice = 8,
736 .l3_banks = 4,
737 .simulator_id = 16,
738 };
739
740 static const struct intel_device_info intel_device_info_kbl_gt3 = {
741 GFX9_FEATURES,
742 .platform = INTEL_PLATFORM_KBL,
743 .gt = 3,
744
745 .num_slices = 2,
746 .num_subslices = { 3, 3, },
747 .max_eus_per_subslice = 8,
748 .l3_banks = 8,
749 .simulator_id = 16,
750 };
751
752 static const struct intel_device_info intel_device_info_kbl_gt4 = {
753 GFX9_FEATURES,
754 .platform = INTEL_PLATFORM_KBL,
755 .gt = 4,
756
757 /*
758 * From the "L3 Allocation and Programming" documentation:
759 *
760 * "URB is limited to 1008KB due to programming restrictions. This
761 * is not a restriction of the L3 implementation, but of the FF and
762 * other clients. Therefore, in a GT4 implementation it is
763 * possible for the programmed allocation of the L3 data array to
764 * provide 3*384KB=1152KB for URB, but only 1008KB of this
765 * will be used."
766 */
767 .num_slices = 3,
768 .num_subslices = { 3, 3, 3, },
769 .max_eus_per_subslice = 8,
770 .l3_banks = 12,
771 .simulator_id = 16,
772 };
773
774 static const struct intel_device_info intel_device_info_glk = {
775 GFX9_LP_FEATURES_3X6,
776 .platform = INTEL_PLATFORM_GLK,
777 .l3_banks = 2,
778 .simulator_id = 17,
779 };
780
781 static const struct intel_device_info intel_device_info_glk_2x6 = {
782 GFX9_LP_FEATURES_2X6,
783 .platform = INTEL_PLATFORM_GLK,
784 .l3_banks = 2,
785 .simulator_id = 17,
786 };
787
788 static const struct intel_device_info intel_device_info_cfl_gt1 = {
789 GFX9_FEATURES,
790 .platform = INTEL_PLATFORM_CFL,
791 .gt = 1,
792
793 .num_slices = 1,
794 .num_subslices = { 2, },
795 .max_eus_per_subslice = 6,
796 .l3_banks = 2,
797 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
798 * causing some vertices to go missing if we use too much URB.
799 */
800 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
801 .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
802 .simulator_id = 24,
803 };
804 static const struct intel_device_info intel_device_info_cfl_gt2 = {
805 GFX9_FEATURES,
806 .platform = INTEL_PLATFORM_CFL,
807 .gt = 2,
808
809 .num_slices = 1,
810 .num_subslices = { 3, },
811 .max_eus_per_subslice = 8,
812 .l3_banks = 4,
813 .simulator_id = 24,
814 };
815
816 static const struct intel_device_info intel_device_info_cfl_gt3 = {
817 GFX9_FEATURES,
818 .platform = INTEL_PLATFORM_CFL,
819 .gt = 3,
820
821 .num_slices = 2,
822 .num_subslices = { 3, 3, },
823 .max_eus_per_subslice = 8,
824 .l3_banks = 8,
825 .simulator_id = 24,
826 };
827
828 #define subslices(args...) { args, }
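/* Variadic helper: subslices(4) expands to the array initializer { 4, }, and a
 * per-slice list such as subslices(3, 3) would expand to { 3, 3, }, so counts
 * can be passed straight through as a macro argument.
 */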
829
830 #define GFX11_HW_INFO \
831 .ver = 11, \
832 .has_pln = false, \
833 .max_vs_threads = 364, \
834 .max_gs_threads = 224, \
835 .max_tcs_threads = 224, \
836 .max_tes_threads = 364, \
837 .max_threads_per_psd = 64, \
838 .max_cs_threads = 56, \
839 .cs_prefetch_size = 512
840
841 #define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform) \
842 GFX8_FEATURES, \
843 GFX11_HW_INFO, \
844 .platform = _platform, \
845 .has_64bit_float = false, \
846 .has_64bit_int = false, \
847 .has_integer_dword_mul = false, \
848 .has_sample_with_hiz = false, \
849 .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
850 .num_subslices = _subslices, \
851 .max_eus_per_subslice = 8
852
853 #define GFX11_URB_MIN_MAX_ENTRIES \
854 .min_entries = { \
855 [MESA_SHADER_VERTEX] = 64, \
856 [MESA_SHADER_TESS_EVAL] = 34, \
857 }, \
858 .max_entries = { \
859 [MESA_SHADER_VERTEX] = 2384, \
860 [MESA_SHADER_TESS_CTRL] = 1032, \
861 [MESA_SHADER_TESS_EVAL] = 2384, \
862 [MESA_SHADER_GEOMETRY] = 1032, \
863 }
864
865 static const struct intel_device_info intel_device_info_icl_gt2 = {
866 GFX11_FEATURES(2, 1, subslices(8), 8, INTEL_PLATFORM_ICL),
867 .urb = {
868 GFX11_URB_MIN_MAX_ENTRIES,
869 },
870 .simulator_id = 19,
871 };
872
873 static const struct intel_device_info intel_device_info_icl_gt1_5 = {
874 GFX11_FEATURES(1, 1, subslices(6), 6, INTEL_PLATFORM_ICL),
875 .urb = {
876 GFX11_URB_MIN_MAX_ENTRIES,
877 },
878 .simulator_id = 19,
879 };
880
881 static const struct intel_device_info intel_device_info_icl_gt1 = {
882 GFX11_FEATURES(1, 1, subslices(4), 6, INTEL_PLATFORM_ICL),
883 .urb = {
884 GFX11_URB_MIN_MAX_ENTRIES,
885 },
886 .simulator_id = 19,
887 };
888
889 static const struct intel_device_info intel_device_info_icl_gt0_5 = {
890 GFX11_FEATURES(1, 1, subslices(1), 6, INTEL_PLATFORM_ICL),
891 .urb = {
892 GFX11_URB_MIN_MAX_ENTRIES,
893 },
894 .simulator_id = 19,
895 };
896
897 #define GFX11_LP_FEATURES \
898 .urb = { \
899 GFX11_URB_MIN_MAX_ENTRIES, \
900 }, \
901 .disable_ccs_repack = true, \
902 .simulator_id = 28
903
904 static const struct intel_device_info intel_device_info_ehl_4x8 = {
905 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
906 GFX11_LP_FEATURES,
907 };
908
909 static const struct intel_device_info intel_device_info_ehl_4x6 = {
910 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
911 GFX11_LP_FEATURES,
912 .max_eus_per_subslice = 6,
913 };
914
915 static const struct intel_device_info intel_device_info_ehl_4x5 = {
916 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
917 GFX11_LP_FEATURES,
918 .max_eus_per_subslice = 5,
919 };
920
921 static const struct intel_device_info intel_device_info_ehl_4x4 = {
922 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
923 GFX11_LP_FEATURES,
924 .max_eus_per_subslice = 4,
925 };
926
927 static const struct intel_device_info intel_device_info_ehl_2x8 = {
928 GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
929 GFX11_LP_FEATURES,
930 };
931
932 static const struct intel_device_info intel_device_info_ehl_2x4 = {
933 GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
934 GFX11_LP_FEATURES,
935 .max_eus_per_subslice = 4,
936 };
937
938 #define GFX12_URB_MIN_MAX_ENTRIES \
939 .min_entries = { \
940 [MESA_SHADER_VERTEX] = 64, \
941 [MESA_SHADER_TESS_EVAL] = 34, \
942 }, \
943 .max_entries = { \
944 [MESA_SHADER_VERTEX] = 3576, \
945 [MESA_SHADER_TESS_CTRL] = 1548, \
946 [MESA_SHADER_TESS_EVAL] = 3576, \
947 /* Wa_14013840143 */ \
948 [MESA_SHADER_GEOMETRY] = 1536, \
949 }
950
951 #define GFX12_HW_INFO \
952 .ver = 12, \
953 .has_pln = false, \
954 .has_sample_with_hiz = false, \
955 .has_aux_map = true, \
956 .max_vs_threads = 546, \
957 .max_gs_threads = 336, \
958 .max_tcs_threads = 336, \
959 .max_tes_threads = 546, \
960 .max_threads_per_psd = 64, \
961 .max_cs_threads = 112, /* threads per DSS */ \
962 .urb = { \
963 GFX12_URB_MIN_MAX_ENTRIES, \
964 }
965
966 #define GFX12_FEATURES(_gt, _slices, _l3) \
967 GFX8_FEATURES, \
968 GFX12_HW_INFO, \
969 .has_64bit_float = false, \
970 .has_64bit_int = false, \
971 .has_integer_dword_mul = false, \
972 .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
973 .simulator_id = 22, \
974 .max_eus_per_subslice = 16, \
975 .cs_prefetch_size = 512
976
977 #define dual_subslices(args...) { args, }
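/* Same trick as subslices() above, but the counts are dual-subslices (DSS),
 * the unit the kernel reports on gfx12+: dual_subslices(6) expands to { 6, }.
 */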
978
979 #define GFX12_GT05_FEATURES \
980 GFX12_FEATURES(1, 1, 4), \
981 .num_subslices = dual_subslices(1)
982
983 #define GFX12_GT_FEATURES(_gt) \
984 GFX12_FEATURES(_gt, 1, _gt == 1 ? 4 : 8), \
985 .num_subslices = dual_subslices(_gt == 1 ? 2 : 6)
986
987 static const struct intel_device_info intel_device_info_tgl_gt1 = {
988 GFX12_GT_FEATURES(1),
989 .platform = INTEL_PLATFORM_TGL,
990 };
991
992 static const struct intel_device_info intel_device_info_tgl_gt2 = {
993 GFX12_GT_FEATURES(2),
994 .platform = INTEL_PLATFORM_TGL,
995 };
996
997 static const struct intel_device_info intel_device_info_rkl_gt05 = {
998 GFX12_GT05_FEATURES,
999 .platform = INTEL_PLATFORM_RKL,
1000 };
1001
1002 static const struct intel_device_info intel_device_info_rkl_gt1 = {
1003 GFX12_GT_FEATURES(1),
1004 .platform = INTEL_PLATFORM_RKL,
1005 };
1006
1007 static const struct intel_device_info intel_device_info_adl_gt05 = {
1008 GFX12_GT05_FEATURES,
1009 .platform = INTEL_PLATFORM_ADL,
1010 .display_ver = 13,
1011 };
1012
1013 static const struct intel_device_info intel_device_info_adl_gt1 = {
1014 GFX12_GT_FEATURES(1),
1015 .platform = INTEL_PLATFORM_ADL,
1016 .display_ver = 13,
1017 };
1018
1019 static const struct intel_device_info intel_device_info_adl_n = {
1020 GFX12_GT_FEATURES(1),
1021 .platform = INTEL_PLATFORM_ADL,
1022 .display_ver = 13,
1023 };
1024
1025 static const struct intel_device_info intel_device_info_adl_gt2 = {
1026 GFX12_GT_FEATURES(2),
1027 .platform = INTEL_PLATFORM_ADL,
1028 .display_ver = 13,
1029 };
1030
1031 static const struct intel_device_info intel_device_info_rpl = {
1032 GFX12_FEATURES(1, 1, 4),
1033 .num_subslices = dual_subslices(2),
1034 .platform = INTEL_PLATFORM_RPL,
1035 .display_ver = 13,
1036 };
1037
1038 static const struct intel_device_info intel_device_info_rpl_p = {
1039 GFX12_GT_FEATURES(2),
1040 .platform = INTEL_PLATFORM_RPL,
1041 .display_ver = 13,
1042 };
1043
1044 #define GFX12_DG1_SG1_FEATURES \
1045 GFX12_GT_FEATURES(2), \
1046 .platform = INTEL_PLATFORM_DG1, \
1047 .has_llc = false, \
1048 .has_local_mem = true, \
1049 .urb.size = 768, \
1050 .simulator_id = 30
1051
1052 static const struct intel_device_info intel_device_info_dg1 = {
1053 GFX12_DG1_SG1_FEATURES,
1054 };
1055
1056 static const struct intel_device_info intel_device_info_sg1 = {
1057 GFX12_DG1_SG1_FEATURES,
1058 };
1059
1060 #define XEHP_FEATURES(_gt, _slices, _l3) \
1061 GFX12_FEATURES(_gt, _slices, _l3), \
1062 .num_thread_per_eu = 8 /* BSpec 44472 */, \
1063 .verx10 = 125, \
1064 .has_llc = false, \
1065 .has_local_mem = true, \
1066 .has_aux_map = false, \
1067 .simulator_id = 29, \
1068 .cs_prefetch_size = 1024
1069
1070 #define DG2_FEATURES \
1071 /* (Sub)slice info comes from the kernel topology info */ \
1072 XEHP_FEATURES(0, 1, 0), \
1073 .display_ver = 13, \
1074 .revision = 4, /* For offline compiler */ \
1075 .num_subslices = dual_subslices(1), \
1076 .has_lsc = true, \
1077 .apply_hwconfig = true, \
1078 .has_coarse_pixel_primitive_and_cb = true, \
1079 .has_mesh_shading = true
1080
1081 static const struct intel_device_info intel_device_info_dg2_g10 = {
1082 DG2_FEATURES,
1083 .platform = INTEL_PLATFORM_DG2_G10,
1084 };
1085
1086 static const struct intel_device_info intel_device_info_dg2_g11 = {
1087 DG2_FEATURES,
1088 .platform = INTEL_PLATFORM_DG2_G11,
1089 };
1090
1091 static const struct intel_device_info intel_device_info_dg2_g12 = {
1092 DG2_FEATURES,
1093 .platform = INTEL_PLATFORM_DG2_G12,
1094 };
1095
1096 static void
1097 reset_masks(struct intel_device_info *devinfo)
1098 {
1099 devinfo->subslice_slice_stride = 0;
1100 devinfo->eu_subslice_stride = 0;
1101 devinfo->eu_slice_stride = 0;
1102
1103 devinfo->num_slices = 0;
1104 memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1105
1106 memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1107 memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1108 memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1109 memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1110 }
1111
1112 static void
1113 update_slice_subslice_counts(struct intel_device_info *devinfo)
1114 {
1115 devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1116 devinfo->subslice_total = 0;
1117 for (int s = 0; s < devinfo->max_slices; s++) {
1118 if (!intel_device_info_slice_available(devinfo, s))
1119 continue;
1120
1121 for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1122 devinfo->num_subslices[s] +=
1123 __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1124 }
1125 devinfo->subslice_total += devinfo->num_subslices[s];
1126 }
1127 assert(devinfo->num_slices > 0);
1128 assert(devinfo->subslice_total > 0);
1129 }
1130
1131 static void
1132 update_pixel_pipes(struct intel_device_info *devinfo, uint8_t *subslice_masks)
1133 {
1134 if (devinfo->ver < 11)
1135 return;
1136
1137 /* The kernel only reports one slice on all existing ICL+ platforms, even
1138 * if multiple slices are present. The slice mask is allowed to have an
1139 * accurate value greater than 1 on gfx12.5+ platforms though, in order to
1140 * be tolerant of the behavior of our simulation environment.
1141 */
1142 assert(devinfo->slice_masks == 1 || devinfo->verx10 >= 125);
1143
1144 /* Count the number of subslices on each pixel pipe. Assume that every
1145 * contiguous group of 4 subslices in the mask belong to the same pixel
1146 * pipe. However note that on TGL+ the kernel returns a mask of enabled
1147 * *dual* subslices instead of actual subslices somewhat confusingly, so
1148 * each pixel pipe only takes 2 bits in the mask even though it's still 4
1149 * subslices.
1150 */
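   /* For example, on a gfx12 part (2 bits per pixel pipe) a dual-subslice
    * mask of 0b0111 yields ppipe_subslices[0] = 2 and ppipe_subslices[1] = 1.
    */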
1151 const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1152 for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1153 const unsigned offset = p * ppipe_bits;
1154 const unsigned subslice_idx = offset /
1155 devinfo->max_subslices_per_slice * devinfo->subslice_slice_stride;
1156 const unsigned ppipe_mask =
1157 BITFIELD_RANGE(offset % devinfo->max_subslices_per_slice, ppipe_bits);
1158
1159 if (subslice_idx < ARRAY_SIZE(devinfo->subslice_masks))
1160 devinfo->ppipe_subslices[p] =
1161 __builtin_popcount(subslice_masks[subslice_idx] & ppipe_mask);
1162 else
1163 devinfo->ppipe_subslices[p] = 0;
1164 }
1165 }
1166
1167 static void
1168 update_l3_banks(struct intel_device_info *devinfo)
1169 {
1170 if (devinfo->ver != 12)
1171 return;
1172
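   /* E.g. a fully enabled TGL GT2 (6 dual-subslices) takes the else branch
    * below and gets 8 L3 banks, while an XeHP part with more than 16 enabled
    * DSS gets 32.
    */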
1173 if (devinfo->verx10 >= 125) {
1174 if (devinfo->subslice_total > 16) {
1175 assert(devinfo->subslice_total <= 32);
1176 devinfo->l3_banks = 32;
1177 } else if (devinfo->subslice_total > 8) {
1178 devinfo->l3_banks = 16;
1179 } else {
1180 devinfo->l3_banks = 8;
1181 }
1182 } else {
1183 assert(devinfo->num_slices == 1);
1184 if (devinfo->subslice_total >= 6) {
1185 assert(devinfo->subslice_total == 6);
1186 devinfo->l3_banks = 8;
1187 } else if (devinfo->subslice_total > 2) {
1188 devinfo->l3_banks = 6;
1189 } else {
1190 devinfo->l3_banks = 4;
1191 }
1192 }
1193 }
1194
1195 /* At some point in time, some people decided to redefine what topology means,
1196 * from useful HW related information (slice, subslice, etc...), to much less
1197 * useful generic stuff that no one cares about (a single slice with lots of
1198 * subslices). Of course all of this was done without asking the people who
1199 * defined the topology query in the first place, to solve a lack of
1200 * information on Gfx10+. This function is here to work around the fact that
1201 * it's not possible to change people's minds even before this stuff goes
1202 * upstream. Sad times...
1203 */
1204 static void
1205 update_from_single_slice_topology(struct intel_device_info *devinfo,
1206 const struct drm_i915_query_topology_info *topology,
1207 const struct drm_i915_query_topology_info *geom_topology)
1208 {
1209 /* An array of bit masks of the subslices available for 3D
1210 * workloads, analogous to intel_device_info::subslice_masks. This
1211 * may differ from the set of enabled subslices on XeHP+ platforms
1212 * with compute-only subslices.
1213 */
1214 uint8_t geom_subslice_masks[ARRAY_SIZE(devinfo->subslice_masks)] = { 0 };
1215
1216 assert(devinfo->verx10 >= 125);
1217
1218 reset_masks(devinfo);
1219
1220 assert(topology->max_slices == 1);
1221 assert(topology->max_subslices > 0);
1222 assert(topology->max_eus_per_subslice > 0);
1223
1224 /* i915 gives us only one slice so we have to rebuild that out of groups of
1225 * 4 dual-subslices.
1226 */
1227 devinfo->max_subslices_per_slice = 4;
1228 devinfo->max_eus_per_subslice = 16;
1229 devinfo->subslice_slice_stride = 1;
1230 devinfo->eu_slice_stride = DIV_ROUND_UP(16 * 4, 8);
1231 devinfo->eu_subslice_stride = DIV_ROUND_UP(16, 8);
1232
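   /* For example, with 16 DSS reported by i915 this loop regroups them into 4
    * synthetic slices of 4 dual-subslices each: ss_idx 0..3 -> slice 0,
    * ss_idx 4..7 -> slice 1, and so on.
    */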
1233 for (uint32_t ss_idx = 0; ss_idx < topology->max_subslices; ss_idx++) {
1234 const uint32_t s = ss_idx / 4;
1235 const uint32_t ss = ss_idx % 4;
1236
1237 /* Determine whether ss_idx is enabled (ss_idx_available) and
1238 * available for 3D workloads (geom_ss_idx_available), which may
1239 * differ on XeHP+ if ss_idx is a compute-only DSS.
1240 */
1241 const bool ss_idx_available =
1242 (topology->data[topology->subslice_offset + ss_idx / 8] >>
1243 (ss_idx % 8)) & 1;
1244 const bool geom_ss_idx_available =
1245 (geom_topology->data[geom_topology->subslice_offset + ss_idx / 8] >>
1246 (ss_idx % 8)) & 1;
1247
1248 if (geom_ss_idx_available) {
1249 assert(ss_idx_available);
1250 geom_subslice_masks[s * devinfo->subslice_slice_stride +
1251 ss / 8] |= 1u << (ss % 8);
1252 }
1253
1254 if (!ss_idx_available)
1255 continue;
1256
1257 devinfo->max_slices = MAX2(devinfo->max_slices, s + 1);
1258 devinfo->slice_masks |= 1u << s;
1259
1260 devinfo->subslice_masks[s * devinfo->subslice_slice_stride +
1261 ss / 8] |= 1u << (ss % 8);
1262
1263 for (uint32_t eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
1264 const bool eu_available =
1265 (topology->data[topology->eu_offset +
1266 ss_idx * topology->eu_stride +
1267 eu / 8] >> (eu % 8)) & 1;
1268
1269 if (!eu_available)
1270 continue;
1271
1272 devinfo->eu_masks[s * devinfo->eu_slice_stride +
1273 ss * devinfo->eu_subslice_stride +
1274 eu / 8] |= 1u << (eu % 8);
1275 }
1276 }
1277
1278 update_slice_subslice_counts(devinfo);
1279 update_pixel_pipes(devinfo, geom_subslice_masks);
1280 update_l3_banks(devinfo);
1281 }
1282
1283 static void
1284 update_from_topology(struct intel_device_info *devinfo,
1285 const struct drm_i915_query_topology_info *topology)
1286 {
1287 reset_masks(devinfo);
1288
1289 assert(topology->max_slices > 0);
1290 assert(topology->max_subslices > 0);
1291 assert(topology->max_eus_per_subslice > 0);
1292
1293 devinfo->subslice_slice_stride = topology->subslice_stride;
1294
1295 devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
1296 devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
1297
1298 assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
1299 memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
1300 devinfo->max_slices = topology->max_slices;
1301 devinfo->max_subslices_per_slice = topology->max_subslices;
1302 devinfo->max_eus_per_subslice = topology->max_eus_per_subslice;
1303
1304 uint32_t subslice_mask_len =
1305 topology->max_slices * topology->subslice_stride;
1306 assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
1307 memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
1308 subslice_mask_len);
1309
1310 uint32_t eu_mask_len =
1311 topology->eu_stride * topology->max_subslices * topology->max_slices;
1312 assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
1313 memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
1314
1315 /* Now that all the masks are in place, update the counts. */
1316 update_slice_subslice_counts(devinfo);
1317 update_pixel_pipes(devinfo, devinfo->subslice_masks);
1318 update_l3_banks(devinfo);
1319 }
1320
1321 /* Generate detailed mask from the I915_PARAM_SLICE_MASK,
1322 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
1323 */
1324 static bool
1325 update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
1326 uint32_t subslice_mask, uint32_t n_eus)
1327 {
1328 struct drm_i915_query_topology_info *topology;
1329
1330 assert((slice_mask & 0xff) == slice_mask);
1331
1332 size_t data_length = 100;
1333
1334 topology = calloc(1, sizeof(*topology) + data_length);
1335 if (!topology)
1336 return false;
1337
1338 topology->max_slices = util_last_bit(slice_mask);
1339 topology->max_subslices = util_last_bit(subslice_mask);
1340
1341 topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
1342 topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
1343
1344 uint32_t n_subslices = __builtin_popcount(slice_mask) *
1345 __builtin_popcount(subslice_mask);
1346 uint32_t max_eus_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1347 uint32_t eu_mask = (1U << max_eus_per_subslice) - 1;
1348
1349 topology->max_eus_per_subslice = max_eus_per_subslice;
1350 topology->eu_offset = topology->subslice_offset +
1351 topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
1352 topology->eu_stride = DIV_ROUND_UP(max_eus_per_subslice, 8);
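   /* Worked example: slice_mask = 0x1, subslice_mask = 0x7 and n_eus = 24
    * produce max_slices = 1, max_subslices = 3, max_eus_per_subslice = 8 and
    * eu_mask = 0xff, i.e. a fully populated synthetic topology.
    */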
1353
1354 /* Set slice mask in topology */
1355 for (int b = 0; b < topology->subslice_offset; b++)
1356 topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
1357
1358 for (int s = 0; s < topology->max_slices; s++) {
1359
1360 /* Set subslice mask in topology */
1361 for (int b = 0; b < topology->subslice_stride; b++) {
1362 int subslice_offset = topology->subslice_offset +
1363 s * topology->subslice_stride + b;
1364
1365 topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
1366 }
1367
1368 /* Set eu mask in topology */
1369 for (int ss = 0; ss < topology->max_subslices; ss++) {
1370 for (int b = 0; b < topology->eu_stride; b++) {
1371 int eu_offset = topology->eu_offset +
1372 (s * topology->max_subslices + ss) * topology->eu_stride + b;
1373
1374 topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
1375 }
1376 }
1377 }
1378
1379 update_from_topology(devinfo, topology);
1380 free(topology);
1381
1382 return true;
1383 }
1384
1385 /* Generate mask from the device data. */
1386 static void
1387 fill_masks(struct intel_device_info *devinfo)
1388 {
1389 /* All of our internal device descriptions assign the same number of
1390 * subslices for each slice. Just verify that this is true.
1391 */
1392 for (int s = 1; s < devinfo->num_slices; s++)
1393 assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1394
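   /* For example, the static skl_gt2 entry above (1 slice, 3 subslices, 8 EUs
    * per subslice) expands into slice_mask = 0x1, subslice_mask = 0x7 and
    * n_eus = 24.
    */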
1395 update_from_masks(devinfo,
1396 (1U << devinfo->num_slices) - 1,
1397 (1U << devinfo->num_subslices[0]) - 1,
1398 devinfo->num_slices * devinfo->num_subslices[0] *
1399 devinfo->max_eus_per_subslice);
1400 }
1401
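/* Small wrapper over DRM_IOCTL_I915_GETPARAM. Illustrative use (hypothetical
 * caller, not from this file):
 *
 *    int devid = 0;
 *    if (getparam(fd, I915_PARAM_CHIPSET_ID, &devid))
 *       ... use devid ...
 */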
1402 static bool
1403 getparam(int fd, uint32_t param, int *value)
1404 {
1405 int tmp;
1406
1407 struct drm_i915_getparam gp = {
1408 .param = param,
1409 .value = &tmp,
1410 };
1411
1412 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
1413 if (ret != 0)
1414 return false;
1415
1416 *value = tmp;
1417 return true;
1418 }
1419
1420 static bool
1421 get_context_param(int fd, uint32_t context, uint32_t param, uint64_t *value)
1422 {
1423 struct drm_i915_gem_context_param gp = {
1424 .ctx_id = context,
1425 .param = param,
1426 };
1427
1428 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
1429 if (ret != 0)
1430 return false;
1431
1432 *value = gp.value;
1433 return true;
1434 }
1435
1436 static void
1437 update_cs_workgroup_threads(struct intel_device_info *devinfo)
1438 {
1439 /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1440 * can program is 64 without going up to a rectangular group. This only
1441 * impacts Haswell and TGL, which have higher thread counts.
1442 *
1443 * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1444 * is 10 bits so we have no such restrictions.
1445 */
1446 devinfo->max_cs_workgroup_threads =
1447 devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1448 MIN2(devinfo->max_cs_threads, 64);
1449 }
1450
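/* Fill *devinfo from the static tables above given a PCI device ID.
 * Minimal sketch of offline use (e.g. an offline compiler), assuming only the
 * helpers in this file:
 *
 *    struct intel_device_info devinfo;
 *    int pci_id = intel_device_name_to_pci_device_id("tgl");
 *    if (pci_id < 0 || !intel_get_device_info_from_pci_id(pci_id, &devinfo))
 *       return false;
 */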
1451 bool
1452 intel_get_device_info_from_pci_id(int pci_id,
1453 struct intel_device_info *devinfo)
1454 {
1455 switch (pci_id) {
1456 #undef CHIPSET
1457 #define CHIPSET(id, family, fam_str, name) \
1458 case id: *devinfo = intel_device_info_##family; break;
1459 #include "pci_ids/crocus_pci_ids.h"
1460 #include "pci_ids/iris_pci_ids.h"
1461
1462 #undef CHIPSET
1463 #define CHIPSET(id, fam_str, name) \
1464 case id: *devinfo = intel_device_info_gfx3; break;
1465 #include "pci_ids/i915_pci_ids.h"
1466
1467 default:
1468 mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
1469 return false;
1470 }
1471
1472 switch (pci_id) {
1473 #undef CHIPSET
1474 #define CHIPSET(_id, _family, _fam_str, _name) \
1475 case _id: \
1476 /* sizeof(str_literal) includes the null */ \
1477 STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
1478 sizeof(devinfo->name)); \
1479 strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
1480 break;
1481 #include "pci_ids/crocus_pci_ids.h"
1482 #include "pci_ids/iris_pci_ids.h"
1483 default:
1484 strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
1485 }
1486
1487 fill_masks(devinfo);
1488
1489 /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
1490 *
1491 * "Scratch Space per slice is computed based on 4 sub-slices. SW must
1492 * allocate scratch space enough so that each slice has 4 slices allowed."
1493 *
1494 * The equivalent internal documentation says that this programming note
1495 * applies to all Gfx9+ platforms.
1496 *
1497 * The hardware typically calculates the scratch space pointer by taking
1498 * the base address, and adding per-thread-scratch-space * thread ID.
1499 * Extra padding can be necessary depending how the thread IDs are
1500 * calculated for a particular shader stage.
1501 */
1502
1503 switch(devinfo->ver) {
1504 case 9:
1505 devinfo->max_wm_threads = 64 /* threads-per-PSD */
1506 * devinfo->num_slices
1507 * 4; /* effective subslices per slice */
1508 break;
1509 case 11:
1510 case 12:
1511 devinfo->max_wm_threads = 128 /* threads-per-PSD */
1512 * devinfo->num_slices
1513 * 8; /* subslices per slice */
1514 break;
1515 default:
1516 assert(devinfo->ver < 9);
1517 break;
1518 }
1519
1520 assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));
1521
1522 if (devinfo->verx10 == 0)
1523 devinfo->verx10 = devinfo->ver * 10;
1524
1525 if (devinfo->display_ver == 0)
1526 devinfo->display_ver = devinfo->ver;
1527
1528 update_cs_workgroup_threads(devinfo);
1529
1530 return true;
1531 }
1532
1533 /**
1534 * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
1535 * (kernel 4.13+)
1536 */
1537 static bool
1538 getparam_topology(struct intel_device_info *devinfo, int fd)
1539 {
1540 int slice_mask = 0;
1541 if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
1542 goto maybe_warn;
1543
1544 int n_eus;
1545 if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
1546 goto maybe_warn;
1547
1548 int subslice_mask = 0;
1549 if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
1550 goto maybe_warn;
1551
1552 return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
1553
1554 maybe_warn:
1555 /* Only with Gfx8+ are we starting to see devices with fusing that can only
1556 * be detected at runtime.
1557 */
1558 if (devinfo->ver >= 8)
1559 mesa_logw("Kernel 4.1 required to properly query GPU properties.");
1560
1561 return false;
1562 }
1563
1564 /**
1565 * preferred API for updating the topology in devinfo (kernel 4.17+)
1566 */
1567 static bool
1568 query_topology(struct intel_device_info *devinfo, int fd)
1569 {
1570 struct drm_i915_query_topology_info *topo_info =
1571 intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO, NULL);
1572 if (topo_info == NULL)
1573 return false;
1574
1575 if (devinfo->verx10 >= 125) {
1576 struct drm_i915_query_topology_info *geom_topo_info =
1577 intel_i915_query_alloc(fd, DRM_I915_QUERY_GEOMETRY_SUBSLICES, NULL);
1578 if (geom_topo_info == NULL) {
1579 free(topo_info);
1580 return false;
1581 }
1582
1583 update_from_single_slice_topology(devinfo, topo_info, geom_topo_info);
1584 free(geom_topo_info);
1585 } else {
1586 update_from_topology(devinfo, topo_info);
1587 }
1588
1589 free(topo_info);
1590
1591 return true;
1592
1593 }
1594
1595 /**
1596 * Reports memory region info, and allows buffers to target system-memory,
1597 * and/or device local memory.
1598 */
1599 static bool
1600 query_regions(struct intel_device_info *devinfo, int fd, bool update)
1601 {
1602 struct drm_i915_query_memory_regions *meminfo =
1603 intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS, NULL);
1604 if (meminfo == NULL)
1605 return false;
1606
1607 for (int i = 0; i < meminfo->num_regions; i++) {
1608 const struct drm_i915_memory_region_info *mem = &meminfo->regions[i];
1609 switch (mem->region.memory_class) {
1610 case I915_MEMORY_CLASS_SYSTEM: {
1611 if (!update) {
1612 devinfo->mem.sram.mem_class = mem->region.memory_class;
1613 devinfo->mem.sram.mem_instance = mem->region.memory_instance;
1614 devinfo->mem.sram.mappable.size = mem->probed_size;
1615 } else {
1616 assert(devinfo->mem.sram.mem_class == mem->region.memory_class);
1617 assert(devinfo->mem.sram.mem_instance == mem->region.memory_instance);
1618 assert(devinfo->mem.sram.mappable.size == mem->probed_size);
1619 }
1620 /* The kernel uAPI only reports an accurate unallocated_size value
1621 * for I915_MEMORY_CLASS_DEVICE.
1622 */
1623 uint64_t available;
1624 if (os_get_available_system_memory(&available))
1625 devinfo->mem.sram.mappable.free = MIN2(available, mem->probed_size);
1626 break;
1627 }
1628 case I915_MEMORY_CLASS_DEVICE:
1629 if (!update) {
1630 devinfo->mem.vram.mem_class = mem->region.memory_class;
1631 devinfo->mem.vram.mem_instance = mem->region.memory_instance;
1632 if (mem->probed_cpu_visible_size > 0) {
1633 devinfo->mem.vram.mappable.size = mem->probed_cpu_visible_size;
1634 devinfo->mem.vram.unmappable.size =
1635 mem->probed_size - mem->probed_cpu_visible_size;
1636 } else {
1637 /* We are running on an older kernel without support for the
1638 * small-bar uapi. These kernels only support systems where the
1639 * entire vram is mappable.
1640 */
1641 devinfo->mem.vram.mappable.size = mem->probed_size;
1642 devinfo->mem.vram.unmappable.size = 0;
1643 }
1644 } else {
1645 assert(devinfo->mem.vram.mem_class == mem->region.memory_class);
1646 assert(devinfo->mem.vram.mem_instance == mem->region.memory_instance);
1647 assert((devinfo->mem.vram.mappable.size +
1648 devinfo->mem.vram.unmappable.size) == mem->probed_size);
1649 }
1650 if (mem->unallocated_cpu_visible_size > 0) {
1651 if (mem->unallocated_size != -1) {
1652 devinfo->mem.vram.mappable.free = mem->unallocated_cpu_visible_size;
1653 devinfo->mem.vram.unmappable.free =
1654 mem->unallocated_size - mem->unallocated_cpu_visible_size;
1655 }
1656 } else {
1657 /* We are running on an older kernel without support for the
1658 * small-bar uapi. These kernels only support systems where the
1659 * entire vram is mappable.
1660 */
1661 if (mem->unallocated_size != -1) {
1662 devinfo->mem.vram.mappable.free = mem->unallocated_size;
1663 devinfo->mem.vram.unmappable.free = 0;
1664 }
1665 }
1666 break;
1667 default:
1668 break;
1669 }
1670 }
1671
1672 free(meminfo);
1673 devinfo->mem.use_class_instance = true;
1674 return true;
1675 }
1676
1677 static bool
1678 compute_system_memory(struct intel_device_info *devinfo, bool update)
1679 {
1680 uint64_t total_phys;
1681 if (!os_get_total_physical_memory(&total_phys))
1682 return false;
1683
1684 uint64_t available = 0;
1685 os_get_available_system_memory(&available);
1686
1687 if (!update)
1688 devinfo->mem.sram.mappable.size = total_phys;
1689 else
1690 assert(devinfo->mem.sram.mappable.size == total_phys);
1691
1692 devinfo->mem.sram.mappable.free = available;
1693
1694 return true;
1695 }
1696
1697 static int
1698 intel_get_aperture_size(int fd, uint64_t *size)
1699 {
1700 struct drm_i915_gem_get_aperture aperture = { 0 };
1701
1702 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
1703 if (ret == 0 && size)
1704 *size = aperture.aper_size;
1705
1706 return ret;
1707 }
1708
1709 static bool
1710 has_bit6_swizzle(int fd)
1711 {
1712 struct drm_gem_close close;
1713 int ret;
1714
1715 struct drm_i915_gem_create gem_create = {
1716 .size = 4096,
1717 };
1718
1719 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1720 unreachable("Failed to create GEM BO");
1721 return false;
1722 }
1723
1724 bool swizzled = false;
1725
1726 /* set_tiling overwrites the input on the error path, so we have to open
1727 * code intel_ioctl.
1728 */
1729 do {
1730 struct drm_i915_gem_set_tiling set_tiling = {
1731 .handle = gem_create.handle,
1732 .tiling_mode = I915_TILING_X,
1733 .stride = 512,
1734 };
1735
1736 ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
1737 } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
1738
1739 if (ret != 0) {
1740 unreachable("Failed to set BO tiling");
1741 goto close_and_return;
1742 }
1743
1744 struct drm_i915_gem_get_tiling get_tiling = {
1745 .handle = gem_create.handle,
1746 };
1747
1748 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
1749 unreachable("Failed to get BO tiling");
1750 goto close_and_return;
1751 }
1752
1753 assert(get_tiling.tiling_mode == I915_TILING_X);
1754 swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
1755
1756 close_and_return:
1757 memset(&close, 0, sizeof(close));
1758 close.handle = gem_create.handle;
1759 intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1760
1761 return swizzled;
1762 }
1763
1764 static bool
1765 has_get_tiling(int fd)
1766 {
1767 int ret;
1768
1769 struct drm_i915_gem_create gem_create = {
1770 .size = 4096,
1771 };
1772
1773 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1774 unreachable("Failed to create GEM BO");
1775 return false;
1776 }
1777
1778 struct drm_i915_gem_get_tiling get_tiling = {
1779 .handle = gem_create.handle,
1780 };
1781 ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling);
1782
1783 struct drm_gem_close close = {
1784 .handle = gem_create.handle,
1785 };
1786 intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1787
1788 return ret == 0;
1789 }
1790
1791 static void
1792 fixup_chv_device_info(struct intel_device_info *devinfo)
1793 {
1794 assert(devinfo->platform == INTEL_PLATFORM_CHV);
1795
1796 /* Cherryview is annoying. The number of EUs depends on fusing and
1797 * isn't determinable from the PCI ID alone. We default to the minimum
1798 * available for that PCI ID and then compute the real value from the
1799 * subslice information we get from the kernel.
1800 */
1801 const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
1802 const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1803
1804 /* Logical CS threads = EUs per subslice * num threads per EU */
1805 uint32_t max_cs_threads =
1806 eu_total / subslice_total * devinfo->num_thread_per_eu;
1807
1808 /* Fuse configurations may give more threads than expected, never less. */
1809 if (max_cs_threads > devinfo->max_cs_threads)
1810 devinfo->max_cs_threads = max_cs_threads;
1811
1812 update_cs_workgroup_threads(devinfo);
1813
1814 /* Braswell is even more annoying. Its marketing name isn't determinable
1815 * from the PCI ID and is also dependent on fusing.
1816 */
1817 if (devinfo->pci_device_id != 0x22B1)
1818 return;
1819
1820 char *bsw_model;
1821 switch (eu_total) {
1822 case 16: bsw_model = "405"; break;
1823 case 12: bsw_model = "400"; break;
1824 default: bsw_model = " "; break;
1825 }
1826
1827 char *needle = strstr(devinfo->name, "XXX");
1828 assert(needle);
1829 if (needle)
1830 memcpy(needle, bsw_model, 3);
1831 }
1832
1833 static void
1834 init_max_scratch_ids(struct intel_device_info *devinfo)
1835 {
1836 /* Determine the max number of subslices that potentially might be used in
1837 * scratch space ids.
1838 *
1839 * For Gfx11+, scratch space allocation is based on the number of threads
1840 * in the base configuration.
1841 *
1842 * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1843 * we wish to view that there are 4 subslices per slice instead of the
1844 * actual number of subslices per slice. The documentation for 3DSTATE_PS
1845 * "Scratch Space Base Pointer" says:
1846 *
1847 * "Scratch Space per slice is computed based on 4 sub-slices. SW
1848 * must allocate scratch space enough so that each slice has 4
1849 * slices allowed."
1850 *
1851 * According to the other driver team, this applies to compute shaders
1852 * as well. This is not currently documented at all.
1853 *
1854 * For Gfx8 and older we use devinfo->subslice_total.
1855 */
1856 unsigned subslices;
1857 if (devinfo->verx10 == 125)
1858 subslices = 32;
1859 else if (devinfo->ver == 12)
1860 subslices = (devinfo->platform == INTEL_PLATFORM_DG1 || devinfo->gt == 2 ? 6 : 2);
1861 else if (devinfo->ver == 11)
1862 subslices = 8;
1863 else if (devinfo->ver >= 9 && devinfo->ver < 11)
1864 subslices = 4 * devinfo->num_slices;
1865 else
1866 subslices = devinfo->subslice_total;
1867 assert(subslices >= devinfo->subslice_total);
1868
1869 unsigned scratch_ids_per_subslice;
1870 if (devinfo->ver >= 12) {
1871 /* Same as ICL below, but with 16 EUs. */
1872 scratch_ids_per_subslice = 16 * 8;
1873 } else if (devinfo->ver >= 11) {
1874 /* The MEDIA_VFE_STATE docs say:
1875 *
1876 * "Starting with this configuration, the Maximum Number of
1877 * Threads must be set to (#EU * 8) for GPGPU dispatches.
1878 *
1879 * Although there are only 7 threads per EU in the configuration,
1880 * the FFTID is calculated as if there are 8 threads per EU,
1881 * which in turn requires a larger amount of Scratch Space to be
1882 * allocated by the driver."
1883 */
1884 scratch_ids_per_subslice = 8 * 8;
1885 } else if (devinfo->platform == INTEL_PLATFORM_HSW) {
1886 /* WaCSScratchSize:hsw
1887 *
1888 * Haswell's scratch space address calculation appears to be sparse
1889 * rather than tightly packed. The Thread ID has bits indicating
1890 * which subslice, EU within a subslice, and thread within an EU it
1891 * is. There's a maximum of two slices and two subslices, so these
1892 * can be stored with a single bit. Even though there are only 10 EUs
1893 * per subslice, this is stored in 4 bits, so there's an effective
1894 * maximum value of 16 EUs. Similarly, although there are only 7
1895 * threads per EU, this is stored in a 3 bit number, giving an
1896 * effective maximum value of 8 threads per EU.
1897 *
1898 * This means that we need to use 16 * 8 instead of 10 * 7 for the
1899 * number of threads per subslice.
1900 */
1901 scratch_ids_per_subslice = 16 * 8;
1902 } else if (devinfo->platform == INTEL_PLATFORM_CHV) {
1903 /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1904 * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1905 * as if they had 8 EUs.
1906 */
1907 scratch_ids_per_subslice = 8 * 7;
1908 } else {
1909 scratch_ids_per_subslice = devinfo->max_cs_threads;
1910 }
1911
1912 unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1913
1914 if (devinfo->verx10 >= 125) {
1915 /* On GFX version 12.5, scratch access changed to a surface-based model.
1916 * Instead of each shader type having its own layout based on IDs passed
1917 * from the relevant fixed-function unit, all scratch access is based on
1918 * thread IDs like it always has been for compute.
1919 */
1920 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1921 devinfo->max_scratch_ids[i] = max_thread_ids;
1922 } else {
1923 unsigned max_scratch_ids[] = {
1924 [MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
1925 [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1926 [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1927 [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
1928 [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
1929 [MESA_SHADER_COMPUTE] = max_thread_ids,
1930 };
1931 STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1932 memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1933 sizeof(devinfo->max_scratch_ids));
1934 }
1935 }
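/* Worked example for init_max_scratch_ids() above (illustrative numbers,
 * assuming a Gfx9 GT2-style part with 1 slice, 3 subslices and
 * max_cs_threads == 56): subslices is padded to 4 * 1 = 4 per the
 * 3DSTATE_PS note, scratch_ids_per_subslice falls through to
 * max_cs_threads == 56, so max_thread_ids = 56 * 4 = 224.  The compute
 * stage then gets max_scratch_ids[MESA_SHADER_COMPUTE] = 224 while the
 * other stages keep their fixed-function thread maximums.
 */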
1936
1937 bool
1938 intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
1939 {
1940 /* Get PCI info.
1941 *
1942 * Some callers may already have a valid drm device which holds values of
1943 * PCI fields queried here prior to calling this function. But making this
1944 * query optional leads to a more cumbersome implementation: those callers
1945 * would still have to initialize the fields outside of this function and
1946 * rely on an ioctl to get the PCI device id for the next step when
1947 * skipping this drm query.
1948 */
1949 drmDevicePtr drmdev = NULL;
1950 if (drmGetDevice2(fd, DRM_DEVICE_GET_PCI_REVISION, &drmdev)) {
1951 mesa_loge("Failed to query drm device.");
1952 return false;
1953 }
1954 if (!intel_get_device_info_from_pci_id
1955 (drmdev->deviceinfo.pci->device_id, devinfo)) {
1956 drmFreeDevice(&drmdev);
1957 return false;
1958 }
1959 devinfo->pci_domain = drmdev->businfo.pci->domain;
1960 devinfo->pci_bus = drmdev->businfo.pci->bus;
1961 devinfo->pci_dev = drmdev->businfo.pci->dev;
1962 devinfo->pci_func = drmdev->businfo.pci->func;
1963 devinfo->pci_device_id = drmdev->deviceinfo.pci->device_id;
1964 devinfo->pci_revision_id = drmdev->deviceinfo.pci->revision_id;
1965 drmFreeDevice(&drmdev);
1966 devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);
1967
1968 if (devinfo->ver == 10) {
1969 mesa_loge("Gfx10 support is redacted.");
1970 return false;
1971 }
1972
1973 /* remaining initialization queries the kernel for device info */
1974 if (devinfo->no_hw) {
1975 /* Provide some sensible values for NO_HW. */
1976 devinfo->gtt_size =
1977 devinfo->ver >= 8 ? (1ull << 48) : 2ull * 1024 * 1024 * 1024;
1978 compute_system_memory(devinfo, false);
1979 return true;
1980 }
1981
1982 if (intel_get_and_process_hwconfig_table(fd, devinfo)) {
1983 /* After applying hwconfig values, some items need to be recalculated. */
1984 devinfo->max_cs_threads =
1985 devinfo->max_eus_per_subslice * devinfo->num_thread_per_eu;
1986
1987 update_cs_workgroup_threads(devinfo);
1988 }
1989
1990 int timestamp_frequency;
1991 if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
1992 &timestamp_frequency))
1993 devinfo->timestamp_frequency = timestamp_frequency;
1994 else if (devinfo->ver >= 10) {
1995 mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
1996 return false;
1997 }
1998
1999 if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
2000 devinfo->revision = 0;
2001
2002 if (!query_topology(devinfo, fd)) {
2003 if (devinfo->ver >= 10) {
2004 /* topology uAPI required for CNL+ (kernel 4.17+) */
2005 return false;
2006 }
2007
2008 /* else use the kernel 4.13+ api for gfx8+. For older kernels, topology
2009 * will be wrong, affecting GPU metrics. In this case, fail silently.
2010 */
2011 getparam_topology(devinfo, fd);
2012 }
2013
2014 /* If the memory region uAPI query is not available, try to generate some
2015 * numbers out of os_* utils for sram only.
2016 */
2017 if (!query_regions(devinfo, fd, false))
2018 compute_system_memory(devinfo, false);
2019
2020 /* region info is required for lmem support */
2021 if (devinfo->has_local_mem && !devinfo->mem.use_class_instance) {
2022 mesa_logw("Could not query local memory size.");
2023 return false;
2024 }
2025
2026 if (devinfo->platform == INTEL_PLATFORM_CHV)
2027 fixup_chv_device_info(devinfo);
2028
2029 /* Broadwell PRM says:
2030 *
2031 * "Before Gfx8, there was a historical configuration control field to
2032 * swizzle address bit[6] for in X/Y tiling modes. This was set in three
2033 * different places: TILECTL[1:0], ARB_MODE[5:4], and
2034 * DISP_ARB_CTL[14:13].
2035 *
2036 * For Gfx8 and subsequent generations, the swizzle fields are all
2037 * reserved, and the CPU's memory controller performs all address
2038 * swizzling modifications."
2039 */
2040 devinfo->has_bit6_swizzle = devinfo->ver < 8 && has_bit6_swizzle(fd);
2041
2042 intel_get_aperture_size(fd, &devinfo->aperture_bytes);
2043 get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, &devinfo->gtt_size);
2044 devinfo->has_tiling_uapi = has_get_tiling(fd);
2045
2046 /* Gfx7 and older do not support EU/Subslice info */
2047 assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
2048 devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
2049
2050 init_max_scratch_ids(devinfo);
2051
2052 return true;
2053 }
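/* Minimal caller sketch (illustrative only, not part of this file): callers
 * open a DRM render node themselves and pass the fd in, e.g.:
 *
 *    int fd = open("/dev/dri/renderD128", O_RDWR | O_CLOEXEC);
 *    struct intel_device_info devinfo;
 *    if (fd >= 0 && intel_get_device_info_from_fd(fd, &devinfo))
 *       printf("%s (Gfx%d)\n", devinfo.name, devinfo.ver);
 *
 * The device node path is a placeholder; real callers typically enumerate
 * devices via drmGetDevices2() rather than hard-coding a node.
 */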
2054
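/* Refresh the memory information of an already initialized devinfo from the
 * given fd: prefer the memory region uAPI and fall back to the os_* based
 * system memory estimate when that query is unavailable.
 */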
2055 bool intel_device_info_update_memory_info(struct intel_device_info *devinfo, int fd)
2056 {
2057 return query_regions(devinfo, fd, true) || compute_system_memory(devinfo, true);
2058 }
2059