• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2013 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 
31 #include <xf86drm.h>
32 
33 #include "intel_device_info.h"
34 #include "intel_hwconfig.h"
35 #include "intel/common/intel_gem.h"
36 #include "util/bitscan.h"
37 #include "util/debug.h"
38 #include "util/log.h"
39 #include "util/macros.h"
40 #include "util/os_misc.h"
41 
42 #include "drm-uapi/i915_drm.h"
43 
/* Maps a short device name to a PCI device ID.
 *
 * Scanned linearly by intel_device_name_to_pci_device_id(); names are
 * compared with strcmp(), so they are matched exactly and case-sensitively.
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { "lpt", 0x27a2 },
   { "brw", 0x2a02 },
   { "g4x", 0x2a42 },
   { "ilk", 0x0042 },
   { "snb", 0x0126 },
   { "ivb", 0x016a },
   { "hsw", 0x0d2e },
   { "byt", 0x0f33 },
   { "bdw", 0x162e },
   { "chv", 0x22B3 },
   { "skl", 0x1912 },
   { "bxt", 0x5A85 },
   { "kbl", 0x5912 },
   { "aml", 0x591C },
   { "glk", 0x3185 },
   { "cfl", 0x3E9B },
   { "whl", 0x3EA1 },
   { "cml", 0x9b41 },
   { "icl", 0x8a52 },
   { "ehl", 0x4500 },
   { "jsl", 0x4E71 },
   { "tgl", 0x9a49 },
   { "rkl", 0x4c8a },
   { "dg1", 0x4905 },
   { "adl", 0x4680 },
   { "sg1", 0x4907 },
   { "rpl", 0xa780 },
   { "dg2", 0x5690 },
};
77 
78 /**
79  * Get the PCI ID for the device name.
80  *
81  * Returns -1 if the device is not known.
82  */
83 int
intel_device_name_to_pci_device_id(const char * name)84 intel_device_name_to_pci_device_id(const char *name)
85 {
86    for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
87       if (!strcmp(name_map[i].name, name))
88          return name_map[i].pci_id;
89    }
90 
91    return -1;
92 }
93 
94 static const struct intel_device_info intel_device_info_gfx3 = {
95    .ver = 3,
96    .platform = INTEL_PLATFORM_GFX3,
97    .simulator_id = -1,
98    .num_slices = 1,
99    .num_subslices = { 1, },
100    .max_eus_per_subslice = 8,
101    .num_thread_per_eu = 4,
102    .timestamp_frequency = 12500000,
103    .cs_prefetch_size = 512,
104 };
105 
106 static const struct intel_device_info intel_device_info_i965 = {
107    .ver = 4,
108    .platform = INTEL_PLATFORM_I965,
109    .has_negative_rhw_bug = true,
110    .num_slices = 1,
111    .num_subslices = { 1, },
112    .max_eus_per_subslice = 8,
113    .num_thread_per_eu = 4,
114    .max_vs_threads = 16,
115    .max_gs_threads = 2,
116    .max_wm_threads = 8 * 4,
117    .urb = {
118       .size = 256,
119    },
120    .timestamp_frequency = 12500000,
121    .simulator_id = -1,
122    .cs_prefetch_size = 512,
123 };
124 
125 static const struct intel_device_info intel_device_info_g4x = {
126    .ver = 4,
127    .verx10 = 45,
128    .has_pln = true,
129    .has_compr4 = true,
130    .has_surface_tile_offset = true,
131    .platform = INTEL_PLATFORM_G4X,
132    .num_slices = 1,
133    .num_subslices = { 1, },
134    .max_eus_per_subslice = 10,
135    .num_thread_per_eu = 5,
136    .max_vs_threads = 32,
137    .max_gs_threads = 2,
138    .max_wm_threads = 10 * 5,
139    .urb = {
140       .size = 384,
141    },
142    .timestamp_frequency = 12500000,
143    .simulator_id = -1,
144    .cs_prefetch_size = 512,
145 };
146 
147 static const struct intel_device_info intel_device_info_ilk = {
148    .ver = 5,
149    .platform = INTEL_PLATFORM_ILK,
150    .has_pln = true,
151    .has_compr4 = true,
152    .has_surface_tile_offset = true,
153    .num_slices = 1,
154    .num_subslices = { 1, },
155    .max_eus_per_subslice = 12,
156    .num_thread_per_eu = 6,
157    .max_vs_threads = 72,
158    .max_gs_threads = 32,
159    .max_wm_threads = 12 * 6,
160    .urb = {
161       .size = 1024,
162    },
163    .timestamp_frequency = 12500000,
164    .simulator_id = -1,
165    .cs_prefetch_size = 512,
166 };
167 
168 static const struct intel_device_info intel_device_info_snb_gt1 = {
169    .ver = 6,
170    .gt = 1,
171    .platform = INTEL_PLATFORM_SNB,
172    .has_hiz_and_separate_stencil = true,
173    .has_llc = true,
174    .has_pln = true,
175    .has_surface_tile_offset = true,
176    .needs_unlit_centroid_workaround = true,
177    .num_slices = 1,
178    .num_subslices = { 1, },
179    .max_eus_per_subslice = 6,
180    .num_thread_per_eu = 6, /* Not confirmed */
181    .max_vs_threads = 24,
182    .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
183    .max_wm_threads = 40,
184    .urb = {
185       .size = 32,
186       .min_entries = {
187          [MESA_SHADER_VERTEX]   = 24,
188       },
189       .max_entries = {
190          [MESA_SHADER_VERTEX]   = 256,
191          [MESA_SHADER_GEOMETRY] = 256,
192       },
193    },
194    .timestamp_frequency = 12500000,
195    .simulator_id = -1,
196    .cs_prefetch_size = 512,
197 };
198 
199 static const struct intel_device_info intel_device_info_snb_gt2 = {
200    .ver = 6,
201    .gt = 2,
202    .platform = INTEL_PLATFORM_SNB,
203    .has_hiz_and_separate_stencil = true,
204    .has_llc = true,
205    .has_pln = true,
206    .has_surface_tile_offset = true,
207    .needs_unlit_centroid_workaround = true,
208    .num_slices = 1,
209    .num_subslices = { 1, },
210    .max_eus_per_subslice = 12,
211    .num_thread_per_eu = 6, /* Not confirmed */
212    .max_vs_threads = 60,
213    .max_gs_threads = 60,
214    .max_wm_threads = 80,
215    .urb = {
216       .size = 64,
217       .min_entries = {
218          [MESA_SHADER_VERTEX]   = 24,
219       },
220       .max_entries = {
221          [MESA_SHADER_VERTEX]   = 256,
222          [MESA_SHADER_GEOMETRY] = 256,
223       },
224    },
225    .timestamp_frequency = 12500000,
226    .simulator_id = -1,
227    .cs_prefetch_size = 512,
228 };
229 
/* Designated initializers shared by every .ver = 7 device description.
 *
 * Expands to a comma-separated list with no trailing comma, so it must be
 * followed by "," inside a struct initializer.  A later designator for the
 * same field in the same initializer overrides the value given here.
 */
#define GFX7_FEATURES                               \
   .ver = 7,                                        \
   .has_hiz_and_separate_stencil = true,            \
   .must_use_separate_stencil = true,               \
   .has_llc = true,                                 \
   .has_pln = true,                                 \
   .has_64bit_float = true,                         \
   .has_surface_tile_offset = true,                 \
   .timestamp_frequency = 12500000,                 \
   .max_constant_urb_size_kb = 16,                  \
   .cs_prefetch_size = 512
241 
242 static const struct intel_device_info intel_device_info_ivb_gt1 = {
243    GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
244    .num_slices = 1,
245    .num_subslices = { 1, },
246    .max_eus_per_subslice = 6,
247    .num_thread_per_eu = 6,
248    .l3_banks = 2,
249    .max_vs_threads = 36,
250    .max_tcs_threads = 36,
251    .max_tes_threads = 36,
252    .max_gs_threads = 36,
253    .max_wm_threads = 48,
254    .max_cs_threads = 36,
255    .urb = {
256       .min_entries = {
257          [MESA_SHADER_VERTEX]    = 32,
258          [MESA_SHADER_TESS_EVAL] = 10,
259       },
260       .max_entries = {
261          [MESA_SHADER_VERTEX]    = 512,
262          [MESA_SHADER_TESS_CTRL] = 32,
263          [MESA_SHADER_TESS_EVAL] = 288,
264          [MESA_SHADER_GEOMETRY]  = 192,
265       },
266    },
267    .simulator_id = 7,
268 };
269 
270 static const struct intel_device_info intel_device_info_ivb_gt2 = {
271    GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 2,
272    .num_slices = 1,
273    .num_subslices = { 1, },
274    .max_eus_per_subslice = 12,
275    .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
276                             * @max_wm_threads ... */
277    .l3_banks = 4,
278    .max_vs_threads = 128,
279    .max_tcs_threads = 128,
280    .max_tes_threads = 128,
281    .max_gs_threads = 128,
282    .max_wm_threads = 172,
283    .max_cs_threads = 64,
284    .urb = {
285       .min_entries = {
286          [MESA_SHADER_VERTEX]    = 32,
287          [MESA_SHADER_TESS_EVAL] = 10,
288       },
289       .max_entries = {
290          [MESA_SHADER_VERTEX]    = 704,
291          [MESA_SHADER_TESS_CTRL] = 64,
292          [MESA_SHADER_TESS_EVAL] = 448,
293          [MESA_SHADER_GEOMETRY]  = 320,
294       },
295    },
296    .simulator_id = 7,
297 };
298 
299 static const struct intel_device_info intel_device_info_byt = {
300    GFX7_FEATURES, .platform = INTEL_PLATFORM_BYT, .gt = 1,
301    .num_slices = 1,
302    .num_subslices = { 1, },
303    .max_eus_per_subslice = 4,
304    .num_thread_per_eu = 8,
305    .l3_banks = 1,
306    .has_llc = false,
307    .max_vs_threads = 36,
308    .max_tcs_threads = 36,
309    .max_tes_threads = 36,
310    .max_gs_threads = 36,
311    .max_wm_threads = 48,
312    .max_cs_threads = 32,
313    .urb = {
314       .min_entries = {
315          [MESA_SHADER_VERTEX]    = 32,
316          [MESA_SHADER_TESS_EVAL] = 10,
317       },
318       .max_entries = {
319          [MESA_SHADER_VERTEX]    = 512,
320          [MESA_SHADER_TESS_CTRL] = 32,
321          [MESA_SHADER_TESS_EVAL] = 288,
322          [MESA_SHADER_GEOMETRY]  = 192,
323       },
324    },
325    .simulator_id = 10,
326 };
327 
/* Designated initializers shared by the INTEL_PLATFORM_HSW descriptions:
 * everything in GFX7_FEATURES plus the platform, .verx10 = 75 and
 * SIMD16 three-source support.  No trailing comma is emitted.
 */
#define HSW_FEATURES \
   GFX7_FEATURES, \
   .platform = INTEL_PLATFORM_HSW, \
   .verx10 = 75, \
   .supports_simd16_3src = true
333 
334 static const struct intel_device_info intel_device_info_hsw_gt1 = {
335    HSW_FEATURES, .gt = 1,
336    .num_slices = 1,
337    .num_subslices = { 1, },
338    .max_eus_per_subslice = 10,
339    .num_thread_per_eu = 7,
340    .l3_banks = 2,
341    .max_vs_threads = 70,
342    .max_tcs_threads = 70,
343    .max_tes_threads = 70,
344    .max_gs_threads = 70,
345    .max_wm_threads = 102,
346    .max_cs_threads = 70,
347    .urb = {
348       .min_entries = {
349          [MESA_SHADER_VERTEX]    = 32,
350          [MESA_SHADER_TESS_EVAL] = 10,
351       },
352       .max_entries = {
353          [MESA_SHADER_VERTEX]    = 640,
354          [MESA_SHADER_TESS_CTRL] = 64,
355          [MESA_SHADER_TESS_EVAL] = 384,
356          [MESA_SHADER_GEOMETRY]  = 256,
357       },
358    },
359    .simulator_id = 9,
360 };
361 
362 static const struct intel_device_info intel_device_info_hsw_gt2 = {
363    HSW_FEATURES, .gt = 2,
364    .num_slices = 1,
365    .num_subslices = { 2, },
366    .max_eus_per_subslice = 10,
367    .num_thread_per_eu = 7,
368    .l3_banks = 4,
369    .max_vs_threads = 280,
370    .max_tcs_threads = 256,
371    .max_tes_threads = 280,
372    .max_gs_threads = 256,
373    .max_wm_threads = 204,
374    .max_cs_threads = 70,
375    .urb = {
376       .min_entries = {
377          [MESA_SHADER_VERTEX]    = 64,
378          [MESA_SHADER_TESS_EVAL] = 10,
379       },
380       .max_entries = {
381          [MESA_SHADER_VERTEX]    = 1664,
382          [MESA_SHADER_TESS_CTRL] = 128,
383          [MESA_SHADER_TESS_EVAL] = 960,
384          [MESA_SHADER_GEOMETRY]  = 640,
385       },
386    },
387    .simulator_id = 9,
388 };
389 
390 static const struct intel_device_info intel_device_info_hsw_gt3 = {
391    HSW_FEATURES, .gt = 3,
392    .num_slices = 2,
393    .num_subslices = { 2, 2, },
394    .max_eus_per_subslice = 10,
395    .num_thread_per_eu = 7,
396    .l3_banks = 8,
397    .max_vs_threads = 280,
398    .max_tcs_threads = 256,
399    .max_tes_threads = 280,
400    .max_gs_threads = 256,
401    .max_wm_threads = 408,
402    .max_cs_threads = 70,
403    .urb = {
404       .min_entries = {
405          [MESA_SHADER_VERTEX]    = 64,
406          [MESA_SHADER_TESS_EVAL] = 10,
407       },
408       .max_entries = {
409          [MESA_SHADER_VERTEX]    = 1664,
410          [MESA_SHADER_TESS_CTRL] = 128,
411          [MESA_SHADER_TESS_EVAL] = 960,
412          [MESA_SHADER_GEOMETRY]  = 640,
413       },
414    },
415    .max_constant_urb_size_kb = 32,
416    .simulator_id = 9,
417 };
418 
/* Designated initializers shared by the .ver = 8 device descriptions and
 * reused as the base of the GFX9, GFX11 and GFX12 feature macros, which
 * override individual fields (including .ver) with later designators.
 *
 * It's unclear how well supported sampling from the hiz buffer is on GFX8,
 * so keep things conservative for now and set has_sample_with_hiz = false.
 *
 * No trailing comma is emitted.
 */
#define GFX8_FEATURES                               \
   .ver = 8,                                        \
   .has_hiz_and_separate_stencil = true,            \
   .must_use_separate_stencil = true,               \
   .has_llc = true,                                 \
   .has_sample_with_hiz = false,                    \
   .has_pln = true,                                 \
   .has_integer_dword_mul = true,                   \
   .has_64bit_float = true,                         \
   .has_64bit_int = true,                           \
   .supports_simd16_3src = true,                    \
   .has_surface_tile_offset = true,                 \
   .num_thread_per_eu = 7,                          \
   .max_vs_threads = 504,                           \
   .max_tcs_threads = 504,                          \
   .max_tes_threads = 504,                          \
   .max_gs_threads = 504,                           \
   .max_wm_threads = 384,                           \
   .max_threads_per_psd = 64,                       \
   .timestamp_frequency = 12500000,                 \
   .max_constant_urb_size_kb = 32,                  \
   .cs_prefetch_size = 512
444 
445 static const struct intel_device_info intel_device_info_bdw_gt1 = {
446    GFX8_FEATURES, .gt = 1,
447    .platform = INTEL_PLATFORM_BDW,
448    .num_slices = 1,
449    .num_subslices = { 2, },
450    .max_eus_per_subslice = 6,
451    .l3_banks = 2,
452    .max_cs_threads = 42,
453    .urb = {
454       .min_entries = {
455          [MESA_SHADER_VERTEX]    = 64,
456          [MESA_SHADER_TESS_EVAL] = 34,
457       },
458       .max_entries = {
459          [MESA_SHADER_VERTEX]    = 2560,
460          [MESA_SHADER_TESS_CTRL] = 504,
461          [MESA_SHADER_TESS_EVAL] = 1536,
462          /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
463          [MESA_SHADER_GEOMETRY]  = 690,
464       },
465    },
466    .simulator_id = 11,
467 };
468 
469 static const struct intel_device_info intel_device_info_bdw_gt2 = {
470    GFX8_FEATURES, .gt = 2,
471    .platform = INTEL_PLATFORM_BDW,
472    .num_slices = 1,
473    .num_subslices = { 3, },
474    .max_eus_per_subslice = 8,
475    .l3_banks = 4,
476    .max_cs_threads = 56,
477    .urb = {
478       .min_entries = {
479          [MESA_SHADER_VERTEX]    = 64,
480          [MESA_SHADER_TESS_EVAL] = 34,
481       },
482       .max_entries = {
483          [MESA_SHADER_VERTEX]    = 2560,
484          [MESA_SHADER_TESS_CTRL] = 504,
485          [MESA_SHADER_TESS_EVAL] = 1536,
486          [MESA_SHADER_GEOMETRY]  = 960,
487       },
488    },
489    .simulator_id = 11,
490 };
491 
492 static const struct intel_device_info intel_device_info_bdw_gt3 = {
493    GFX8_FEATURES, .gt = 3,
494    .platform = INTEL_PLATFORM_BDW,
495    .num_slices = 2,
496    .num_subslices = { 3, 3, },
497    .max_eus_per_subslice = 8,
498    .l3_banks = 8,
499    .max_cs_threads = 56,
500    .urb = {
501       .min_entries = {
502          [MESA_SHADER_VERTEX]    = 64,
503          [MESA_SHADER_TESS_EVAL] = 34,
504       },
505       .max_entries = {
506          [MESA_SHADER_VERTEX]    = 2560,
507          [MESA_SHADER_TESS_CTRL] = 504,
508          [MESA_SHADER_TESS_EVAL] = 1536,
509          [MESA_SHADER_GEOMETRY]  = 960,
510       },
511    },
512    .simulator_id = 11,
513 };
514 
515 static const struct intel_device_info intel_device_info_chv = {
516    GFX8_FEATURES, .platform = INTEL_PLATFORM_CHV, .gt = 1,
517    .has_llc = false,
518    .has_integer_dword_mul = false,
519    .num_slices = 1,
520    .num_subslices = { 2, },
521    .max_eus_per_subslice = 8,
522    .l3_banks = 2,
523    .max_vs_threads = 80,
524    .max_tcs_threads = 80,
525    .max_tes_threads = 80,
526    .max_gs_threads = 80,
527    .max_wm_threads = 128,
528    .max_cs_threads = 6 * 7,
529    .urb = {
530       .min_entries = {
531          [MESA_SHADER_VERTEX]    = 34,
532          [MESA_SHADER_TESS_EVAL] = 34,
533       },
534       .max_entries = {
535          [MESA_SHADER_VERTEX]    = 640,
536          [MESA_SHADER_TESS_CTRL] = 80,
537          [MESA_SHADER_TESS_EVAL] = 384,
538          [MESA_SHADER_GEOMETRY]  = 256,
539       },
540    },
541    .simulator_id = 13,
542 };
543 
/* .ver = 9 hardware limits: thread counts, timestamp frequency and the
 * default URB entry table.
 *
 * Used after GFX8_FEATURES in GFX9_FEATURES and GFX9_LP_FEATURES, so the
 * fields set here override the GFX8 values.  No trailing comma is emitted.
 */
#define GFX9_HW_INFO                                \
   .ver = 9,                                        \
   .max_vs_threads = 336,                           \
   .max_gs_threads = 336,                           \
   .max_tcs_threads = 336,                          \
   .max_tes_threads = 336,                          \
   .max_threads_per_psd = 64,                       \
   .max_cs_threads = 56,                            \
   .timestamp_frequency = 12000000,                 \
   .cs_prefetch_size = 512,                         \
   .urb = {                                         \
      .min_entries = {                              \
         [MESA_SHADER_VERTEX]    = 64,              \
         [MESA_SHADER_TESS_EVAL] = 34,              \
      },                                            \
      .max_entries = {                              \
         [MESA_SHADER_VERTEX]    = 1856,            \
         [MESA_SHADER_TESS_CTRL] = 672,             \
         [MESA_SHADER_TESS_EVAL] = 1120,            \
         [MESA_SHADER_GEOMETRY]  = 640,             \
      },                                            \
   }
566 
/* Base initializers for the GFX9 "LP" descriptions (used by the BXT and
 * GLK entries through the _3X6 / _2X6 variants).
 *
 * Layers GFX8_FEATURES, then GFX9_HW_INFO, then the overrides below; each
 * later designator replaces the earlier value for the same field, and the
 * second ".urb = { ... }" replaces the URB table from GFX9_HW_INFO.
 * No trailing comma is emitted.
 */
#define GFX9_LP_FEATURES                           \
   GFX8_FEATURES,                                  \
   GFX9_HW_INFO,                                   \
   .has_integer_dword_mul = false,                 \
   .gt = 1,                                        \
   .has_llc = false,                               \
   .has_sample_with_hiz = true,                    \
   .num_slices = 1,                                \
   .num_thread_per_eu = 6,                         \
   .max_vs_threads = 112,                          \
   .max_tcs_threads = 112,                         \
   .max_tes_threads = 112,                         \
   .max_gs_threads = 112,                          \
   .max_cs_threads = 6 * 6,                        \
   .timestamp_frequency = 19200000,                \
   .urb = {                                        \
      .min_entries = {                             \
         [MESA_SHADER_VERTEX]    = 34,             \
         [MESA_SHADER_TESS_EVAL] = 34,             \
      },                                           \
      .max_entries = {                             \
         [MESA_SHADER_VERTEX]    = 704,            \
         [MESA_SHADER_TESS_CTRL] = 256,            \
         [MESA_SHADER_TESS_EVAL] = 416,            \
         [MESA_SHADER_GEOMETRY]  = 256,            \
      },                                           \
   }
594 
/* GFX9_LP_FEATURES with 3 subslices of up to 6 EUs each.
 * No trailing comma is emitted.
 */
#define GFX9_LP_FEATURES_3X6                       \
   GFX9_LP_FEATURES,                               \
   .num_subslices = { 3, },                        \
   .max_eus_per_subslice = 6
599 
/* GFX9_LP_FEATURES with 2 subslices of up to 6 EUs each.
 *
 * The thread limits and the ".urb = { ... }" table below override the
 * values supplied by GFX9_LP_FEATURES.  No trailing comma is emitted.
 */
#define GFX9_LP_FEATURES_2X6                       \
   GFX9_LP_FEATURES,                               \
   .num_subslices = { 2, },                        \
   .max_eus_per_subslice = 6,                      \
   .max_vs_threads = 56,                           \
   .max_tcs_threads = 56,                          \
   .max_tes_threads = 56,                          \
   .max_gs_threads = 56,                           \
   .max_cs_threads = 6 * 6,                        \
   .urb = {                                        \
      .min_entries = {                             \
         [MESA_SHADER_VERTEX]    = 34,             \
         [MESA_SHADER_TESS_EVAL] = 34,             \
      },                                           \
      .max_entries = {                             \
         [MESA_SHADER_VERTEX]    = 352,            \
         [MESA_SHADER_TESS_CTRL] = 128,            \
         [MESA_SHADER_TESS_EVAL] = 208,            \
         [MESA_SHADER_GEOMETRY]  = 128,            \
      },                                           \
   }
621 
/* Base initializers for the non-LP GFX9 descriptions (SKL, KBL, CFL):
 * GFX8_FEATURES overridden by GFX9_HW_INFO, with has_sample_with_hiz
 * switched to true.  No trailing comma is emitted.
 */
#define GFX9_FEATURES                               \
   GFX8_FEATURES,                                   \
   GFX9_HW_INFO,                                    \
   .has_sample_with_hiz = true
626 
627 static const struct intel_device_info intel_device_info_skl_gt1 = {
628    GFX9_FEATURES, .gt = 1,
629    .platform = INTEL_PLATFORM_SKL,
630    .num_slices = 1,
631    .num_subslices = { 2, },
632    .max_eus_per_subslice = 6,
633    .l3_banks = 2,
634    /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
635     * leading to some vertices to go missing if we use too much URB.
636     */
637    .urb.max_entries[MESA_SHADER_VERTEX] = 928,
638    .simulator_id = 12,
639 };
640 
641 static const struct intel_device_info intel_device_info_skl_gt2 = {
642    GFX9_FEATURES, .gt = 2,
643    .platform = INTEL_PLATFORM_SKL,
644    .num_slices = 1,
645    .num_subslices = { 3, },
646    .max_eus_per_subslice = 8,
647    .l3_banks = 4,
648    .simulator_id = 12,
649 };
650 
651 static const struct intel_device_info intel_device_info_skl_gt3 = {
652    GFX9_FEATURES, .gt = 3,
653    .platform = INTEL_PLATFORM_SKL,
654    .num_slices = 2,
655    .num_subslices = { 3, 3, },
656    .max_eus_per_subslice = 8,
657    .l3_banks = 8,
658    .simulator_id = 12,
659 };
660 
661 static const struct intel_device_info intel_device_info_skl_gt4 = {
662    GFX9_FEATURES, .gt = 4,
663    .platform = INTEL_PLATFORM_SKL,
664    .num_slices = 3,
665    .num_subslices = { 3, 3, 3, },
666    .max_eus_per_subslice = 8,
667    .l3_banks = 12,
668    /* From the "L3 Allocation and Programming" documentation:
669     *
670     * "URB is limited to 1008KB due to programming restrictions.  This is not a
671     * restriction of the L3 implementation, but of the FF and other clients.
672     * Therefore, in a GT4 implementation it is possible for the programmed
673     * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
674     * only 1008KB of this will be used."
675     */
676    .simulator_id = 12,
677 };
678 
679 static const struct intel_device_info intel_device_info_bxt = {
680    GFX9_LP_FEATURES_3X6,
681    .platform = INTEL_PLATFORM_BXT,
682    .l3_banks = 2,
683    .simulator_id = 14,
684 };
685 
686 static const struct intel_device_info intel_device_info_bxt_2x6 = {
687    GFX9_LP_FEATURES_2X6,
688    .platform = INTEL_PLATFORM_BXT,
689    .l3_banks = 1,
690    .simulator_id = 14,
691 };
692 /*
693  * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
694  * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
695  */
696 
697 static const struct intel_device_info intel_device_info_kbl_gt1 = {
698    GFX9_FEATURES,
699    .platform = INTEL_PLATFORM_KBL,
700    .gt = 1,
701 
702    .max_cs_threads = 7 * 6,
703    .num_slices = 1,
704    .num_subslices = { 2, },
705    .max_eus_per_subslice = 6,
706    .l3_banks = 2,
707    /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
708     * leading to some vertices to go missing if we use too much URB.
709     */
710    .urb.max_entries[MESA_SHADER_VERTEX] = 928,
711    .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
712    .simulator_id = 16,
713 };
714 
715 static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
716    GFX9_FEATURES,
717    .platform = INTEL_PLATFORM_KBL,
718    .gt = 1,
719 
720    .max_cs_threads = 7 * 6,
721    .num_slices = 1,
722    .num_subslices = { 3, },
723    .max_eus_per_subslice = 6,
724    .l3_banks = 4,
725    .simulator_id = 16,
726 };
727 
728 static const struct intel_device_info intel_device_info_kbl_gt2 = {
729    GFX9_FEATURES,
730    .platform = INTEL_PLATFORM_KBL,
731    .gt = 2,
732 
733    .num_slices = 1,
734    .num_subslices = { 3, },
735    .max_eus_per_subslice = 8,
736    .l3_banks = 4,
737    .simulator_id = 16,
738 };
739 
740 static const struct intel_device_info intel_device_info_kbl_gt3 = {
741    GFX9_FEATURES,
742    .platform = INTEL_PLATFORM_KBL,
743    .gt = 3,
744 
745    .num_slices = 2,
746    .num_subslices = { 3, 3, },
747    .max_eus_per_subslice = 8,
748    .l3_banks = 8,
749    .simulator_id = 16,
750 };
751 
752 static const struct intel_device_info intel_device_info_kbl_gt4 = {
753    GFX9_FEATURES,
754    .platform = INTEL_PLATFORM_KBL,
755    .gt = 4,
756 
757    /*
758     * From the "L3 Allocation and Programming" documentation:
759     *
760     * "URB is limited to 1008KB due to programming restrictions.  This
761     *  is not a restriction of the L3 implementation, but of the FF and
762     *  other clients.  Therefore, in a GT4 implementation it is
763     *  possible for the programmed allocation of the L3 data array to
764     *  provide 3*384KB=1152KB for URB, but only 1008KB of this
765     *  will be used."
766     */
767    .num_slices = 3,
768    .num_subslices = { 3, 3, 3, },
769    .max_eus_per_subslice = 8,
770    .l3_banks = 12,
771    .simulator_id = 16,
772 };
773 
774 static const struct intel_device_info intel_device_info_glk = {
775    GFX9_LP_FEATURES_3X6,
776    .platform = INTEL_PLATFORM_GLK,
777    .l3_banks = 2,
778    .simulator_id = 17,
779 };
780 
781 static const struct intel_device_info intel_device_info_glk_2x6 = {
782    GFX9_LP_FEATURES_2X6,
783    .platform = INTEL_PLATFORM_GLK,
784    .l3_banks = 2,
785    .simulator_id = 17,
786 };
787 
788 static const struct intel_device_info intel_device_info_cfl_gt1 = {
789    GFX9_FEATURES,
790    .platform = INTEL_PLATFORM_CFL,
791    .gt = 1,
792 
793    .num_slices = 1,
794    .num_subslices = { 2, },
795    .max_eus_per_subslice = 6,
796    .l3_banks = 2,
797    /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
798     * leading to some vertices to go missing if we use too much URB.
799     */
800    .urb.max_entries[MESA_SHADER_VERTEX] = 928,
801    .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
802    .simulator_id = 24,
803 };
804 static const struct intel_device_info intel_device_info_cfl_gt2 = {
805    GFX9_FEATURES,
806    .platform = INTEL_PLATFORM_CFL,
807    .gt = 2,
808 
809    .num_slices = 1,
810    .num_subslices = { 3, },
811    .max_eus_per_subslice = 8,
812    .l3_banks = 4,
813    .simulator_id = 24,
814 };
815 
816 static const struct intel_device_info intel_device_info_cfl_gt3 = {
817    GFX9_FEATURES,
818    .platform = INTEL_PLATFORM_CFL,
819    .gt = 3,
820 
821    .num_slices = 2,
822    .num_subslices = { 3, 3, },
823    .max_eus_per_subslice = 8,
824    .l3_banks = 8,
825    .simulator_id = 24,
826 };
827 
/* Wraps its arguments in a braced initializer list with a trailing comma,
 * e.g. subslices(3, 3) -> { 3, 3, }.  Lets a per-slice subslice list be
 * passed as a single macro argument despite the commas it contains.
 */
#define subslices(...) { __VA_ARGS__, }
829 
/* .ver = 11 hardware limits.
 *
 * Used after GFX8_FEATURES in GFX11_FEATURES, so .ver, has_pln and the
 * thread limits here override the GFX8 values.  No trailing comma is
 * emitted.
 */
#define GFX11_HW_INFO                               \
   .ver = 11,                                       \
   .has_pln = false,                                \
   .max_vs_threads = 364,                           \
   .max_gs_threads = 224,                           \
   .max_tcs_threads = 224,                          \
   .max_tes_threads = 364,                          \
   .max_threads_per_psd = 64,                       \
   .max_cs_threads = 56,                            \
   .cs_prefetch_size = 512
840 
/* Base initializers for a .ver = 11 device description.
 *
 * _gt        value for .gt
 * _slices    value for .num_slices
 * _subslices braced initializer for .num_subslices (see subslices())
 * _l3        value for .l3_banks
 * _platform  value for .platform
 *
 * Layers GFX8_FEATURES, then GFX11_HW_INFO, then switches off 64-bit
 * float/int, integer dword multiply and sample-with-hiz.  Defaults
 * max_eus_per_subslice to 8; a later designator may override it.
 * No trailing comma is emitted.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform)  \
   GFX8_FEATURES,                                     \
   GFX11_HW_INFO,                                     \
   .platform = _platform,                             \
   .has_64bit_float = false,                          \
   .has_64bit_int = false,                            \
   .has_integer_dword_mul = false,                    \
   .has_sample_with_hiz = false,                      \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .num_subslices = _subslices,                       \
   .max_eus_per_subslice = 8
852 
/* Per-stage minimum and maximum URB entry counts for .ver = 11 devices.
 * Meant to be expanded inside a ".urb = { ... }" initializer; no trailing
 * comma is emitted.
 */
#define GFX11_URB_MIN_MAX_ENTRIES                     \
   .min_entries = {                                   \
      [MESA_SHADER_VERTEX]    = 64,                   \
      [MESA_SHADER_TESS_EVAL] = 34,                   \
   },                                                 \
   .max_entries = {                                   \
      [MESA_SHADER_VERTEX]    = 2384,                 \
      [MESA_SHADER_TESS_CTRL] = 1032,                 \
      [MESA_SHADER_TESS_EVAL] = 2384,                 \
      [MESA_SHADER_GEOMETRY]  = 1032,                 \
   }
864 
865 static const struct intel_device_info intel_device_info_icl_gt2 = {
866    GFX11_FEATURES(2, 1, subslices(8), 8, INTEL_PLATFORM_ICL),
867    .urb = {
868       GFX11_URB_MIN_MAX_ENTRIES,
869    },
870    .simulator_id = 19,
871 };
872 
873 static const struct intel_device_info intel_device_info_icl_gt1_5 = {
874    GFX11_FEATURES(1, 1, subslices(6), 6, INTEL_PLATFORM_ICL),
875    .urb = {
876       GFX11_URB_MIN_MAX_ENTRIES,
877    },
878    .simulator_id = 19,
879 };
880 
881 static const struct intel_device_info intel_device_info_icl_gt1 = {
882    GFX11_FEATURES(1, 1, subslices(4), 6, INTEL_PLATFORM_ICL),
883    .urb = {
884       GFX11_URB_MIN_MAX_ENTRIES,
885    },
886    .simulator_id = 19,
887 };
888 
889 static const struct intel_device_info intel_device_info_icl_gt0_5 = {
890    GFX11_FEATURES(1, 1, subslices(1), 6, INTEL_PLATFORM_ICL),
891    .urb = {
892       GFX11_URB_MIN_MAX_ENTRIES,
893    },
894    .simulator_id = 19,
895 };
896 
/* Extra initializers shared by the INTEL_PLATFORM_EHL descriptions: the
 * GFX11 URB entry table, disable_ccs_repack and simulator id 28.
 * Intended to follow GFX11_FEATURES(...); no trailing comma is emitted.
 */
#define GFX11_LP_FEATURES                           \
   .urb = {                                         \
      GFX11_URB_MIN_MAX_ENTRIES,                    \
   },                                               \
   .disable_ccs_repack = true,                      \
   .simulator_id = 28
903 
904 static const struct intel_device_info intel_device_info_ehl_4x8 = {
905    GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
906    GFX11_LP_FEATURES,
907 };
908 
909 static const struct intel_device_info intel_device_info_ehl_4x6 = {
910    GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
911    GFX11_LP_FEATURES,
912    .max_eus_per_subslice = 6,
913 };
914 
915 static const struct intel_device_info intel_device_info_ehl_4x5 = {
916    GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
917    GFX11_LP_FEATURES,
918    .max_eus_per_subslice = 5,
919 };
920 
921 static const struct intel_device_info intel_device_info_ehl_4x4 = {
922    GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
923    GFX11_LP_FEATURES,
924    .max_eus_per_subslice = 4,
925 };
926 
927 static const struct intel_device_info intel_device_info_ehl_2x8 = {
928    GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
929    GFX11_LP_FEATURES,
930 };
931 
932 static const struct intel_device_info intel_device_info_ehl_2x4 = {
933    GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
934    GFX11_LP_FEATURES,
935    .max_eus_per_subslice = 4,
936 };
937 
/* Gfx12 URB entry limits, indexed by shader stage.  Stages that are not
 * listed keep the implicit zero of the designated initializer.
 */
#define GFX12_URB_MIN_MAX_ENTRIES                               \
   .min_entries = {                                             \
      [MESA_SHADER_VERTEX]    = 64,                             \
      [MESA_SHADER_TESS_EVAL] = 34,                             \
   },                                                           \
   .max_entries = {                                             \
      [MESA_SHADER_VERTEX]    = 3576,                           \
      [MESA_SHADER_TESS_CTRL] = 1548,                           \
      [MESA_SHADER_TESS_EVAL] = 3576,                           \
      /* Wa_14013840143 */                                      \
      [MESA_SHADER_GEOMETRY]  = 1536,                           \
   }

/* Version number, feature flags, per-stage thread limits and URB limits
 * common to every Gfx12 description in this file.
 */
#define GFX12_HW_INFO                                           \
   .ver = 12,                                                   \
   .has_pln = false,                                            \
   .has_sample_with_hiz = false,                                \
   .has_aux_map = true,                                         \
   .max_vs_threads = 546,                                       \
   .max_gs_threads = 336,                                       \
   .max_tcs_threads = 336,                                      \
   .max_tes_threads = 546,                                      \
   .max_threads_per_psd = 64,                                   \
   .max_cs_threads = 112, /* threads per DSS */                 \
   .urb = { GFX12_URB_MIN_MAX_ENTRIES }

/* Full Gfx12 initializer list: GFX8_FEATURES first, then the Gfx12 values
 * that override it.  _gt, _slices and _l3 set .gt, .num_slices and
 * .l3_banks.  Later designators in the same initializer override earlier
 * ones, so users of this macro append their own overrides after it.
 */
#define GFX12_FEATURES(_gt, _slices, _l3)                       \
   GFX8_FEATURES,                                               \
   GFX12_HW_INFO,                                               \
   .has_64bit_float = false,                                    \
   .has_64bit_int = false,                                      \
   .has_integer_dword_mul = false,                              \
   .gt = _gt,                                                   \
   .num_slices = _slices,                                       \
   .l3_banks = _l3,                                             \
   .simulator_id = 22,                                          \
   .max_eus_per_subslice = 16,                                  \
   .cs_prefetch_size = 512

/* Brace-enclosed initializer for the per-slice num_subslices array. */
#define dual_subslices(args...) { args, }

/* GT0.5 configuration: gt 1, one slice, 4 L3 banks, 1 dual subslice. */
#define GFX12_GT05_FEATURES                                     \
   GFX12_FEATURES(1, 1, 4),                                     \
   .num_subslices = dual_subslices(1)

/* GT1 or GT2 configuration: one slice; GT1 gets 4 L3 banks and 2 dual
 * subslices, any other _gt value gets 8 L3 banks and 6 dual subslices.
 */
#define GFX12_GT_FEATURES(_gt)                                  \
   GFX12_FEATURES(_gt, 1, _gt == 1 ? 4 : 8),                    \
   .num_subslices = dual_subslices(_gt == 1 ? 2 : 6)
986 
987 static const struct intel_device_info intel_device_info_tgl_gt1 = {
988    GFX12_GT_FEATURES(1),
989    .platform = INTEL_PLATFORM_TGL,
990 };
991 
992 static const struct intel_device_info intel_device_info_tgl_gt2 = {
993    GFX12_GT_FEATURES(2),
994    .platform = INTEL_PLATFORM_TGL,
995 };
996 
997 static const struct intel_device_info intel_device_info_rkl_gt05 = {
998    GFX12_GT05_FEATURES,
999    .platform = INTEL_PLATFORM_RKL,
1000 };
1001 
1002 static const struct intel_device_info intel_device_info_rkl_gt1 = {
1003    GFX12_GT_FEATURES(1),
1004    .platform = INTEL_PLATFORM_RKL,
1005 };
1006 
1007 static const struct intel_device_info intel_device_info_adl_gt05 = {
1008    GFX12_GT05_FEATURES,
1009    .platform = INTEL_PLATFORM_ADL,
1010    .display_ver = 13,
1011 };
1012 
1013 static const struct intel_device_info intel_device_info_adl_gt1 = {
1014    GFX12_GT_FEATURES(1),
1015    .platform = INTEL_PLATFORM_ADL,
1016    .display_ver = 13,
1017 };
1018 
1019 static const struct intel_device_info intel_device_info_adl_n = {
1020    GFX12_GT_FEATURES(1),
1021    .platform = INTEL_PLATFORM_ADL,
1022    .display_ver = 13,
1023 };
1024 
1025 static const struct intel_device_info intel_device_info_adl_gt2 = {
1026    GFX12_GT_FEATURES(2),
1027    .platform = INTEL_PLATFORM_ADL,
1028    .display_ver = 13,
1029 };
1030 
1031 static const struct intel_device_info intel_device_info_rpl = {
1032    GFX12_FEATURES(1, 1, 4),
1033    .num_subslices = dual_subslices(2),
1034    .platform = INTEL_PLATFORM_RPL,
1035    .display_ver = 13,
1036 };
1037 
1038 static const struct intel_device_info intel_device_info_rpl_p = {
1039    GFX12_GT_FEATURES(2),
1040    .platform = INTEL_PLATFORM_RPL,
1041    .display_ver = 13,
1042 };
1043 
1044 #define GFX12_DG1_SG1_FEATURES                  \
1045    GFX12_GT_FEATURES(2),                        \
1046    .platform = INTEL_PLATFORM_DG1,              \
1047    .has_llc = false,                            \
1048    .has_local_mem = true,                       \
1049    .urb.size = 768,                             \
1050    .simulator_id = 30
1051 
1052 static const struct intel_device_info intel_device_info_dg1 = {
1053    GFX12_DG1_SG1_FEATURES,
1054 };
1055 
1056 static const struct intel_device_info intel_device_info_sg1 = {
1057    GFX12_DG1_SG1_FEATURES,
1058 };
1059 
1060 #define XEHP_FEATURES(_gt, _slices, _l3)                        \
1061    GFX12_FEATURES(_gt, _slices, _l3),                           \
1062    .num_thread_per_eu = 8 /* BSpec 44472 */,                    \
1063    .verx10 = 125,                                               \
1064    .has_llc = false,                                            \
1065    .has_local_mem = true,                                       \
1066    .has_aux_map = false,                                        \
1067    .simulator_id = 29,                                          \
1068    .cs_prefetch_size = 1024
1069 
1070 #define DG2_FEATURES                                            \
1071    /* (Sub)slice info comes from the kernel topology info */    \
1072    XEHP_FEATURES(0, 1, 0),                                      \
1073    .display_ver = 13,                                           \
1074    .revision = 4, /* For offline compiler */                    \
1075    .num_subslices = dual_subslices(1),                          \
1076    .has_lsc = true,                                             \
1077    .apply_hwconfig = true,                                      \
1078    .has_coarse_pixel_primitive_and_cb = true,                   \
1079    .has_mesh_shading = true
1080 
1081 static const struct intel_device_info intel_device_info_dg2_g10 = {
1082    DG2_FEATURES,
1083    .platform = INTEL_PLATFORM_DG2_G10,
1084 };
1085 
1086 static const struct intel_device_info intel_device_info_dg2_g11 = {
1087    DG2_FEATURES,
1088    .platform = INTEL_PLATFORM_DG2_G11,
1089 };
1090 
1091 static const struct intel_device_info intel_device_info_dg2_g12 = {
1092    DG2_FEATURES,
1093    .platform = INTEL_PLATFORM_DG2_G12,
1094 };
1095 
1096 static void
reset_masks(struct intel_device_info * devinfo)1097 reset_masks(struct intel_device_info *devinfo)
1098 {
1099    devinfo->subslice_slice_stride = 0;
1100    devinfo->eu_subslice_stride = 0;
1101    devinfo->eu_slice_stride = 0;
1102 
1103    devinfo->num_slices = 0;
1104    memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1105 
1106    memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1107    memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1108    memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1109    memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1110 }
1111 
1112 static void
update_slice_subslice_counts(struct intel_device_info * devinfo)1113 update_slice_subslice_counts(struct intel_device_info *devinfo)
1114 {
1115    devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1116    devinfo->subslice_total = 0;
1117    for (int s = 0; s < devinfo->max_slices; s++) {
1118       if (!intel_device_info_slice_available(devinfo, s))
1119          continue;
1120 
1121       for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1122          devinfo->num_subslices[s] +=
1123             __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1124       }
1125       devinfo->subslice_total += devinfo->num_subslices[s];
1126    }
1127    assert(devinfo->num_slices > 0);
1128    assert(devinfo->subslice_total > 0);
1129 }
1130 
1131 static void
update_pixel_pipes(struct intel_device_info * devinfo,uint8_t * subslice_masks)1132 update_pixel_pipes(struct intel_device_info *devinfo, uint8_t *subslice_masks)
1133 {
1134    if (devinfo->ver < 11)
1135       return;
1136 
1137    /* The kernel only reports one slice on all existing ICL+ platforms, even
1138     * if multiple slices are present. The slice mask is allowed to have the
1139     * accurate value greater than 1 on gfx12.5+ platforms though, in order to
1140     * be tolerant with the behavior of our simulation environment.
1141     */
1142    assert(devinfo->slice_masks == 1 || devinfo->verx10 >= 125);
1143 
1144    /* Count the number of subslices on each pixel pipe. Assume that every
1145     * contiguous group of 4 subslices in the mask belong to the same pixel
1146     * pipe. However note that on TGL+ the kernel returns a mask of enabled
1147     * *dual* subslices instead of actual subslices somewhat confusingly, so
1148     * each pixel pipe only takes 2 bits in the mask even though it's still 4
1149     * subslices.
1150     */
1151    const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1152    for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1153       const unsigned offset = p * ppipe_bits;
1154       const unsigned subslice_idx = offset /
1155          devinfo->max_subslices_per_slice * devinfo->subslice_slice_stride;
1156       const unsigned ppipe_mask =
1157          BITFIELD_RANGE(offset % devinfo->max_subslices_per_slice, ppipe_bits);
1158 
1159       if (subslice_idx < ARRAY_SIZE(devinfo->subslice_masks))
1160          devinfo->ppipe_subslices[p] =
1161             __builtin_popcount(subslice_masks[subslice_idx] & ppipe_mask);
1162       else
1163          devinfo->ppipe_subslices[p] = 0;
1164    }
1165 }
1166 
1167 static void
update_l3_banks(struct intel_device_info * devinfo)1168 update_l3_banks(struct intel_device_info *devinfo)
1169 {
1170    if (devinfo->ver != 12)
1171       return;
1172 
1173    if (devinfo->verx10 >= 125) {
1174       if (devinfo->subslice_total > 16) {
1175          assert(devinfo->subslice_total <= 32);
1176          devinfo->l3_banks = 32;
1177       } else if (devinfo->subslice_total > 8) {
1178          devinfo->l3_banks = 16;
1179       } else {
1180          devinfo->l3_banks = 8;
1181       }
1182    } else {
1183       assert(devinfo->num_slices == 1);
1184       if (devinfo->subslice_total >= 6) {
1185          assert(devinfo->subslice_total == 6);
1186          devinfo->l3_banks = 8;
1187       } else if (devinfo->subslice_total > 2) {
1188          devinfo->l3_banks = 6;
1189       } else {
1190          devinfo->l3_banks = 4;
1191       }
1192    }
1193 }
1194 
1195 /* At some point in time, some people decided to redefine what topology means,
1196  * from useful HW related information (slice, subslice, etc...), to much less
1197  * useful generic stuff that no one cares about (a single slice with lots of
1198  * subslices). Of course all of this was done without asking the people who
1199  * defined the topology query in the first place, to solve a lack of
1200  * information Gfx10+. This function is here to workaround the fact it's not
1201  * possible to change people's mind even before this stuff goes upstream. Sad
1202  * times...
1203  */
1204 static void
update_from_single_slice_topology(struct intel_device_info * devinfo,const struct drm_i915_query_topology_info * topology,const struct drm_i915_query_topology_info * geom_topology)1205 update_from_single_slice_topology(struct intel_device_info *devinfo,
1206                                   const struct drm_i915_query_topology_info *topology,
1207                                   const struct drm_i915_query_topology_info *geom_topology)
1208 {
1209    /* An array of bit masks of the subslices available for 3D
1210     * workloads, analogous to intel_device_info::subslice_masks.  This
1211     * may differ from the set of enabled subslices on XeHP+ platforms
1212     * with compute-only subslices.
1213     */
1214    uint8_t geom_subslice_masks[ARRAY_SIZE(devinfo->subslice_masks)] = { 0 };
1215 
1216    assert(devinfo->verx10 >= 125);
1217 
1218    reset_masks(devinfo);
1219 
1220    assert(topology->max_slices == 1);
1221    assert(topology->max_subslices > 0);
1222    assert(topology->max_eus_per_subslice > 0);
1223 
1224    /* i915 gives us only one slice so we have to rebuild that out of groups of
1225     * 4 dualsubslices.
1226     */
1227    devinfo->max_subslices_per_slice = 4;
1228    devinfo->max_eus_per_subslice = 16;
1229    devinfo->subslice_slice_stride = 1;
1230    devinfo->eu_slice_stride = DIV_ROUND_UP(16 * 4, 8);
1231    devinfo->eu_subslice_stride = DIV_ROUND_UP(16, 8);
1232 
1233    for (uint32_t ss_idx = 0; ss_idx < topology->max_subslices; ss_idx++) {
1234       const uint32_t s = ss_idx / 4;
1235       const uint32_t ss = ss_idx % 4;
1236 
1237       /* Determine whether ss_idx is enabled (ss_idx_available) and
1238        * available for 3D workloads (geom_ss_idx_available), which may
1239        * differ on XeHP+ if ss_idx is a compute-only DSS.
1240        */
1241       const bool ss_idx_available =
1242          (topology->data[topology->subslice_offset + ss_idx / 8] >>
1243           (ss_idx % 8)) & 1;
1244       const bool geom_ss_idx_available =
1245          (geom_topology->data[geom_topology->subslice_offset + ss_idx / 8] >>
1246           (ss_idx % 8)) & 1;
1247 
1248       if (geom_ss_idx_available) {
1249          assert(ss_idx_available);
1250          geom_subslice_masks[s * devinfo->subslice_slice_stride +
1251                              ss / 8] |= 1u << (ss % 8);
1252       }
1253 
1254       if (!ss_idx_available)
1255          continue;
1256 
1257       devinfo->max_slices = MAX2(devinfo->max_slices, s + 1);
1258       devinfo->slice_masks |= 1u << s;
1259 
1260       devinfo->subslice_masks[s * devinfo->subslice_slice_stride +
1261                               ss / 8] |= 1u << (ss % 8);
1262 
1263       for (uint32_t eu = 0; eu < devinfo->max_eus_per_subslice; eu++) {
1264          const bool eu_available =
1265             (topology->data[topology->eu_offset +
1266                             ss_idx * topology->eu_stride +
1267                             eu / 8] >> (eu % 8)) & 1;
1268 
1269          if (!eu_available)
1270             continue;
1271 
1272          devinfo->eu_masks[s * devinfo->eu_slice_stride +
1273                            ss * devinfo->eu_subslice_stride +
1274                            eu / 8] |= 1u << (eu % 8);
1275       }
1276    }
1277 
1278    update_slice_subslice_counts(devinfo);
1279    update_pixel_pipes(devinfo, geom_subslice_masks);
1280    update_l3_banks(devinfo);
1281 }
1282 
1283 static void
update_from_topology(struct intel_device_info * devinfo,const struct drm_i915_query_topology_info * topology)1284 update_from_topology(struct intel_device_info *devinfo,
1285                      const struct drm_i915_query_topology_info *topology)
1286 {
1287    reset_masks(devinfo);
1288 
1289    assert(topology->max_slices > 0);
1290    assert(topology->max_subslices > 0);
1291    assert(topology->max_eus_per_subslice > 0);
1292 
1293    devinfo->subslice_slice_stride = topology->subslice_stride;
1294 
1295    devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
1296    devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
1297 
1298    assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
1299    memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
1300    devinfo->max_slices = topology->max_slices;
1301    devinfo->max_subslices_per_slice = topology->max_subslices;
1302    devinfo->max_eus_per_subslice = topology->max_eus_per_subslice;
1303 
1304    uint32_t subslice_mask_len =
1305       topology->max_slices * topology->subslice_stride;
1306    assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
1307    memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
1308           subslice_mask_len);
1309 
1310    uint32_t eu_mask_len =
1311       topology->eu_stride * topology->max_subslices * topology->max_slices;
1312    assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
1313    memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
1314 
1315    /* Now that all the masks are in place, update the counts. */
1316    update_slice_subslice_counts(devinfo);
1317    update_pixel_pipes(devinfo, devinfo->subslice_masks);
1318    update_l3_banks(devinfo);
1319 }
1320 
1321 /* Generate detailed mask from the I915_PARAM_SLICE_MASK,
1322  * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
1323  */
1324 static bool
update_from_masks(struct intel_device_info * devinfo,uint32_t slice_mask,uint32_t subslice_mask,uint32_t n_eus)1325 update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
1326                   uint32_t subslice_mask, uint32_t n_eus)
1327 {
1328    struct drm_i915_query_topology_info *topology;
1329 
1330    assert((slice_mask & 0xff) == slice_mask);
1331 
1332    size_t data_length = 100;
1333 
1334    topology = calloc(1, sizeof(*topology) + data_length);
1335    if (!topology)
1336       return false;
1337 
1338    topology->max_slices = util_last_bit(slice_mask);
1339    topology->max_subslices = util_last_bit(subslice_mask);
1340 
1341    topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
1342    topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
1343 
1344    uint32_t n_subslices = __builtin_popcount(slice_mask) *
1345       __builtin_popcount(subslice_mask);
1346    uint32_t max_eus_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1347    uint32_t eu_mask = (1U << max_eus_per_subslice) - 1;
1348 
1349    topology->max_eus_per_subslice = max_eus_per_subslice;
1350    topology->eu_offset = topology->subslice_offset +
1351       topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
1352    topology->eu_stride = DIV_ROUND_UP(max_eus_per_subslice, 8);
1353 
1354    /* Set slice mask in topology */
1355    for (int b = 0; b < topology->subslice_offset; b++)
1356       topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
1357 
1358    for (int s = 0; s < topology->max_slices; s++) {
1359 
1360       /* Set subslice mask in topology */
1361       for (int b = 0; b < topology->subslice_stride; b++) {
1362          int subslice_offset = topology->subslice_offset +
1363             s * topology->subslice_stride + b;
1364 
1365          topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
1366       }
1367 
1368       /* Set eu mask in topology */
1369       for (int ss = 0; ss < topology->max_subslices; ss++) {
1370          for (int b = 0; b < topology->eu_stride; b++) {
1371             int eu_offset = topology->eu_offset +
1372                (s * topology->max_subslices + ss) * topology->eu_stride + b;
1373 
1374             topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
1375          }
1376       }
1377    }
1378 
1379    update_from_topology(devinfo, topology);
1380    free(topology);
1381 
1382    return true;
1383 }
1384 
1385 /* Generate mask from the device data. */
1386 static void
fill_masks(struct intel_device_info * devinfo)1387 fill_masks(struct intel_device_info *devinfo)
1388 {
1389    /* All of our internal device descriptions assign the same number of
1390     * subslices for each slice. Just verify that this is true.
1391     */
1392    for (int s = 1; s < devinfo->num_slices; s++)
1393       assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1394 
1395    update_from_masks(devinfo,
1396                      (1U << devinfo->num_slices) - 1,
1397                      (1U << devinfo->num_subslices[0]) - 1,
1398                      devinfo->num_slices * devinfo->num_subslices[0] *
1399                      devinfo->max_eus_per_subslice);
1400 }
1401 
1402 static bool
getparam(int fd,uint32_t param,int * value)1403 getparam(int fd, uint32_t param, int *value)
1404 {
1405    int tmp;
1406 
1407    struct drm_i915_getparam gp = {
1408       .param = param,
1409       .value = &tmp,
1410    };
1411 
1412    int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
1413    if (ret != 0)
1414       return false;
1415 
1416    *value = tmp;
1417    return true;
1418 }
1419 
1420 static bool
get_context_param(int fd,uint32_t context,uint32_t param,uint64_t * value)1421 get_context_param(int fd, uint32_t context, uint32_t param, uint64_t *value)
1422 {
1423    struct drm_i915_gem_context_param gp = {
1424       .ctx_id = context,
1425       .param = param,
1426    };
1427 
1428    int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
1429    if (ret != 0)
1430       return false;
1431 
1432    *value = gp.value;
1433    return true;
1434 }
1435 
1436 static void
update_cs_workgroup_threads(struct intel_device_info * devinfo)1437 update_cs_workgroup_threads(struct intel_device_info *devinfo)
1438 {
1439    /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1440     * can program is 64 without going up to a rectangular group. This only
1441     * impacts Haswell and TGL which have higher thread counts.
1442     *
1443     * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1444     * is 10 bits so we have no such restrictions.
1445     */
1446    devinfo->max_cs_workgroup_threads =
1447       devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1448                                MIN2(devinfo->max_cs_threads, 64);
1449 }
1450 
1451 bool
intel_get_device_info_from_pci_id(int pci_id,struct intel_device_info * devinfo)1452 intel_get_device_info_from_pci_id(int pci_id,
1453                                   struct intel_device_info *devinfo)
1454 {
1455    switch (pci_id) {
1456 #undef CHIPSET
1457 #define CHIPSET(id, family, fam_str, name) \
1458       case id: *devinfo = intel_device_info_##family; break;
1459 #include "pci_ids/crocus_pci_ids.h"
1460 #include "pci_ids/iris_pci_ids.h"
1461 
1462 #undef CHIPSET
1463 #define CHIPSET(id, fam_str, name) \
1464       case id: *devinfo = intel_device_info_gfx3; break;
1465 #include "pci_ids/i915_pci_ids.h"
1466 
1467    default:
1468       mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
1469       return false;
1470    }
1471 
1472    switch (pci_id) {
1473 #undef CHIPSET
1474 #define CHIPSET(_id, _family, _fam_str, _name) \
1475    case _id: \
1476       /* sizeof(str_literal) includes the null */ \
1477       STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
1478                     sizeof(devinfo->name)); \
1479       strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
1480       break;
1481 #include "pci_ids/crocus_pci_ids.h"
1482 #include "pci_ids/iris_pci_ids.h"
1483    default:
1484       strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
1485    }
1486 
1487    fill_masks(devinfo);
1488 
1489    /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
1490     *
1491     * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
1492     *  allocate scratch space enough so that each slice has 4 slices allowed."
1493     *
1494     * The equivalent internal documentation says that this programming note
1495     * applies to all Gfx9+ platforms.
1496     *
1497     * The hardware typically calculates the scratch space pointer by taking
1498     * the base address, and adding per-thread-scratch-space * thread ID.
1499     * Extra padding can be necessary depending how the thread IDs are
1500     * calculated for a particular shader stage.
1501     */
1502 
1503    switch(devinfo->ver) {
1504    case 9:
1505       devinfo->max_wm_threads = 64 /* threads-per-PSD */
1506                               * devinfo->num_slices
1507                               * 4; /* effective subslices per slice */
1508       break;
1509    case 11:
1510    case 12:
1511       devinfo->max_wm_threads = 128 /* threads-per-PSD */
1512                               * devinfo->num_slices
1513                               * 8; /* subslices per slice */
1514       break;
1515    default:
1516       assert(devinfo->ver < 9);
1517       break;
1518    }
1519 
1520    assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));
1521 
1522    if (devinfo->verx10 == 0)
1523       devinfo->verx10 = devinfo->ver * 10;
1524 
1525    if (devinfo->display_ver == 0)
1526       devinfo->display_ver = devinfo->ver;
1527 
1528    update_cs_workgroup_threads(devinfo);
1529 
1530    return true;
1531 }
1532 
1533 /**
1534  * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
1535  * (kernel 4.13+)
1536  */
1537 static bool
getparam_topology(struct intel_device_info * devinfo,int fd)1538 getparam_topology(struct intel_device_info *devinfo, int fd)
1539 {
1540    int slice_mask = 0;
1541    if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
1542       goto maybe_warn;
1543 
1544    int n_eus;
1545    if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
1546       goto maybe_warn;
1547 
1548    int subslice_mask = 0;
1549    if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
1550       goto maybe_warn;
1551 
1552    return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
1553 
1554  maybe_warn:
1555    /* Only with Gfx8+ are we starting to see devices with fusing that can only
1556     * be detected at runtime.
1557     */
1558    if (devinfo->ver >= 8)
1559       mesa_logw("Kernel 4.1 required to properly query GPU properties.");
1560 
1561    return false;
1562 }
1563 
1564 /**
1565  * preferred API for updating the topology in devinfo (kernel 4.17+)
1566  */
1567 static bool
query_topology(struct intel_device_info * devinfo,int fd)1568 query_topology(struct intel_device_info *devinfo, int fd)
1569 {
1570    struct drm_i915_query_topology_info *topo_info =
1571       intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO, NULL);
1572    if (topo_info == NULL)
1573       return false;
1574 
1575    if (devinfo->verx10 >= 125) {
1576       struct drm_i915_query_topology_info *geom_topo_info =
1577          intel_i915_query_alloc(fd, DRM_I915_QUERY_GEOMETRY_SUBSLICES, NULL);
1578       if (geom_topo_info == NULL) {
1579          free(topo_info);
1580          return false;
1581       }
1582 
1583       update_from_single_slice_topology(devinfo, topo_info, geom_topo_info);
1584       free(geom_topo_info);
1585    } else {
1586       update_from_topology(devinfo, topo_info);
1587    }
1588 
1589    free(topo_info);
1590 
1591    return true;
1592 
1593 }
1594 
1595 /**
1596  * Reports memory region info, and allows buffers to target system-memory,
1597  * and/or device local memory.
1598  */
1599 static bool
query_regions(struct intel_device_info * devinfo,int fd,bool update)1600 query_regions(struct intel_device_info *devinfo, int fd, bool update)
1601 {
1602    struct drm_i915_query_memory_regions *meminfo =
1603       intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS, NULL);
1604    if (meminfo == NULL)
1605       return false;
1606 
1607    for (int i = 0; i < meminfo->num_regions; i++) {
1608       const struct drm_i915_memory_region_info *mem = &meminfo->regions[i];
1609       switch (mem->region.memory_class) {
1610       case I915_MEMORY_CLASS_SYSTEM: {
1611          if (!update) {
1612             devinfo->mem.sram.mem_class = mem->region.memory_class;
1613             devinfo->mem.sram.mem_instance = mem->region.memory_instance;
1614             devinfo->mem.sram.mappable.size = mem->probed_size;
1615          } else {
1616             assert(devinfo->mem.sram.mem_class == mem->region.memory_class);
1617             assert(devinfo->mem.sram.mem_instance == mem->region.memory_instance);
1618             assert(devinfo->mem.sram.mappable.size == mem->probed_size);
1619          }
1620          /* The kernel uAPI only reports an accurate unallocated_size value
1621           * for I915_MEMORY_CLASS_DEVICE.
1622           */
1623          uint64_t available;
1624          if (os_get_available_system_memory(&available))
1625             devinfo->mem.sram.mappable.free = MIN2(available, mem->probed_size);
1626          break;
1627       }
1628       case I915_MEMORY_CLASS_DEVICE:
1629          if (!update) {
1630             devinfo->mem.vram.mem_class = mem->region.memory_class;
1631             devinfo->mem.vram.mem_instance = mem->region.memory_instance;
1632             if (mem->probed_cpu_visible_size > 0) {
1633                devinfo->mem.vram.mappable.size = mem->probed_cpu_visible_size;
1634                devinfo->mem.vram.unmappable.size =
1635                   mem->probed_size - mem->probed_cpu_visible_size;
1636             } else {
1637                /* We are running on an older kernel without support for the
1638                 * small-bar uapi. These kernels only support systems where the
1639                 * entire vram is mappable.
1640                 */
1641                devinfo->mem.vram.mappable.size = mem->probed_size;
1642                devinfo->mem.vram.unmappable.size = 0;
1643             }
1644          } else {
1645             assert(devinfo->mem.vram.mem_class == mem->region.memory_class);
1646             assert(devinfo->mem.vram.mem_instance == mem->region.memory_instance);
1647             assert((devinfo->mem.vram.mappable.size +
1648                     devinfo->mem.vram.unmappable.size) == mem->probed_size);
1649          }
1650          if (mem->unallocated_cpu_visible_size > 0) {
1651             if (mem->unallocated_size != -1) {
1652                devinfo->mem.vram.mappable.free = mem->unallocated_cpu_visible_size;
1653                devinfo->mem.vram.unmappable.free =
1654                   mem->unallocated_size - mem->unallocated_cpu_visible_size;
1655             }
1656          } else {
1657             /* We are running on an older kernel without support for the
1658              * small-bar uapi. These kernels only support systems where the
1659              * entire vram is mappable.
1660              */
1661             if (mem->unallocated_size != -1) {
1662                devinfo->mem.vram.mappable.free = mem->unallocated_size;
1663                devinfo->mem.vram.unmappable.free = 0;
1664             }
1665          }
1666          break;
1667       default:
1668          break;
1669       }
1670    }
1671 
1672    free(meminfo);
1673    devinfo->mem.use_class_instance = true;
1674    return true;
1675 }
1676 
1677 static bool
compute_system_memory(struct intel_device_info * devinfo,bool update)1678 compute_system_memory(struct intel_device_info *devinfo, bool update)
1679 {
1680    uint64_t total_phys;
1681    if (!os_get_total_physical_memory(&total_phys))
1682       return false;
1683 
1684    uint64_t available = 0;
1685    os_get_available_system_memory(&available);
1686 
1687    if (!update)
1688       devinfo->mem.sram.mappable.size = total_phys;
1689    else
1690       assert(devinfo->mem.sram.mappable.size == total_phys);
1691 
1692    devinfo->mem.sram.mappable.free = available;
1693 
1694    return true;
1695 }
1696 
1697 static int
intel_get_aperture_size(int fd,uint64_t * size)1698 intel_get_aperture_size(int fd, uint64_t *size)
1699 {
1700    struct drm_i915_gem_get_aperture aperture = { 0 };
1701 
1702    int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
1703    if (ret == 0 && size)
1704       *size = aperture.aper_size;
1705 
1706    return ret;
1707 }
1708 
1709 static bool
has_bit6_swizzle(int fd)1710 has_bit6_swizzle(int fd)
1711 {
1712    struct drm_gem_close close;
1713    int ret;
1714 
1715    struct drm_i915_gem_create gem_create = {
1716       .size = 4096,
1717    };
1718 
1719    if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1720       unreachable("Failed to create GEM BO");
1721       return false;
1722    }
1723 
1724    bool swizzled = false;
1725 
1726    /* set_tiling overwrites the input on the error path, so we have to open
1727     * code intel_ioctl.
1728     */
1729    do {
1730       struct drm_i915_gem_set_tiling set_tiling = {
1731          .handle = gem_create.handle,
1732          .tiling_mode = I915_TILING_X,
1733          .stride = 512,
1734       };
1735 
1736       ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
1737    } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
1738 
1739    if (ret != 0) {
1740       unreachable("Failed to set BO tiling");
1741       goto close_and_return;
1742    }
1743 
1744    struct drm_i915_gem_get_tiling get_tiling = {
1745       .handle = gem_create.handle,
1746    };
1747 
1748    if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
1749       unreachable("Failed to get BO tiling");
1750       goto close_and_return;
1751    }
1752 
1753    assert(get_tiling.tiling_mode == I915_TILING_X);
1754    swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
1755 
1756 close_and_return:
1757    memset(&close, 0, sizeof(close));
1758    close.handle = gem_create.handle;
1759    intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1760 
1761    return swizzled;
1762 }
1763 
1764 static bool
has_get_tiling(int fd)1765 has_get_tiling(int fd)
1766 {
1767    int ret;
1768 
1769    struct drm_i915_gem_create gem_create = {
1770       .size = 4096,
1771    };
1772 
1773    if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1774       unreachable("Failed to create GEM BO");
1775       return false;
1776    }
1777 
1778    struct drm_i915_gem_get_tiling get_tiling = {
1779       .handle = gem_create.handle,
1780    };
1781    ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling);
1782 
1783    struct drm_gem_close close = {
1784       .handle = gem_create.handle,
1785    };
1786    intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1787 
1788    return ret == 0;
1789 }
1790 
1791 static void
fixup_chv_device_info(struct intel_device_info * devinfo)1792 fixup_chv_device_info(struct intel_device_info *devinfo)
1793 {
1794    assert(devinfo->platform == INTEL_PLATFORM_CHV);
1795 
1796    /* Cherryview is annoying.  The number of EUs is depending on fusing and
1797     * isn't determinable from the PCI ID alone.  We default to the minimum
1798     * available for that PCI ID and then compute the real value from the
1799     * subslice information we get from the kernel.
1800     */
1801    const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
1802    const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1803 
1804    /* Logical CS threads = EUs per subslice * num threads per EU */
1805    uint32_t max_cs_threads =
1806       eu_total / subslice_total * devinfo->num_thread_per_eu;
1807 
1808    /* Fuse configurations may give more threads than expected, never less. */
1809    if (max_cs_threads > devinfo->max_cs_threads)
1810       devinfo->max_cs_threads = max_cs_threads;
1811 
1812    update_cs_workgroup_threads(devinfo);
1813 
1814    /* Braswell is even more annoying.  Its marketing name isn't determinable
1815     * from the PCI ID and is also dependent on fusing.
1816     */
1817    if (devinfo->pci_device_id != 0x22B1)
1818       return;
1819 
1820    char *bsw_model;
1821    switch (eu_total) {
1822    case 16: bsw_model = "405"; break;
1823    case 12: bsw_model = "400"; break;
1824    default: bsw_model = "   "; break;
1825    }
1826 
1827    char *needle = strstr(devinfo->name, "XXX");
1828    assert(needle);
1829    if (needle)
1830       memcpy(needle, bsw_model, 3);
1831 }
1832 
1833 static void
init_max_scratch_ids(struct intel_device_info * devinfo)1834 init_max_scratch_ids(struct intel_device_info *devinfo)
1835 {
1836    /* Determine the max number of subslices that potentially might be used in
1837     * scratch space ids.
1838     *
1839     * For, Gfx11+, scratch space allocation is based on the number of threads
1840     * in the base configuration.
1841     *
1842     * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1843     * we wish to view that there are 4 subslices per slice instead of the
1844     * actual number of subslices per slice. The documentation for 3DSTATE_PS
1845     * "Scratch Space Base Pointer" says:
1846     *
1847     *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
1848     *     must allocate scratch space enough so that each slice has 4
1849     *     slices allowed."
1850     *
1851     * According to the other driver team, this applies to compute shaders
1852     * as well.  This is not currently documented at all.
1853     *
1854     * For Gfx8 and older we user devinfo->subslice_total.
1855     */
1856    unsigned subslices;
1857    if (devinfo->verx10 == 125)
1858       subslices = 32;
1859    else if (devinfo->ver == 12)
1860       subslices = (devinfo->platform == INTEL_PLATFORM_DG1 || devinfo->gt == 2 ? 6 : 2);
1861    else if (devinfo->ver == 11)
1862       subslices = 8;
1863    else if (devinfo->ver >= 9 && devinfo->ver < 11)
1864       subslices = 4 * devinfo->num_slices;
1865    else
1866       subslices = devinfo->subslice_total;
1867    assert(subslices >= devinfo->subslice_total);
1868 
1869    unsigned scratch_ids_per_subslice;
1870    if (devinfo->ver >= 12) {
1871       /* Same as ICL below, but with 16 EUs. */
1872       scratch_ids_per_subslice = 16 * 8;
1873    } else if (devinfo->ver >= 11) {
1874       /* The MEDIA_VFE_STATE docs say:
1875        *
1876        *    "Starting with this configuration, the Maximum Number of
1877        *     Threads must be set to (#EU * 8) for GPGPU dispatches.
1878        *
1879        *     Although there are only 7 threads per EU in the configuration,
1880        *     the FFTID is calculated as if there are 8 threads per EU,
1881        *     which in turn requires a larger amount of Scratch Space to be
1882        *     allocated by the driver."
1883        */
1884       scratch_ids_per_subslice = 8 * 8;
1885    } else if (devinfo->platform == INTEL_PLATFORM_HSW) {
1886       /* WaCSScratchSize:hsw
1887        *
1888        * Haswell's scratch space address calculation appears to be sparse
1889        * rather than tightly packed. The Thread ID has bits indicating
1890        * which subslice, EU within a subslice, and thread within an EU it
1891        * is. There's a maximum of two slices and two subslices, so these
1892        * can be stored with a single bit. Even though there are only 10 EUs
1893        * per subslice, this is stored in 4 bits, so there's an effective
1894        * maximum value of 16 EUs. Similarly, although there are only 7
1895        * threads per EU, this is stored in a 3 bit number, giving an
1896        * effective maximum value of 8 threads per EU.
1897        *
1898        * This means that we need to use 16 * 8 instead of 10 * 7 for the
1899        * number of threads per subslice.
1900        */
1901       scratch_ids_per_subslice = 16 * 8;
1902    } else if (devinfo->platform == INTEL_PLATFORM_CHV) {
1903       /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1904        * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1905        * as if it had 8 EUs.
1906        */
1907       scratch_ids_per_subslice = 8 * 7;
1908    } else {
1909       scratch_ids_per_subslice = devinfo->max_cs_threads;
1910    }
1911 
1912    unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1913 
1914    if (devinfo->verx10 >= 125) {
1915       /* On GFX version 12.5, scratch access changed to a surface-based model.
1916        * Instead of each shader type having its own layout based on IDs passed
1917        * from the relevant fixed-function unit, all scratch access is based on
1918        * thread IDs like it always has been for compute.
1919        */
1920       for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1921          devinfo->max_scratch_ids[i] = max_thread_ids;
1922    } else {
1923       unsigned max_scratch_ids[] = {
1924          [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
1925          [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1926          [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1927          [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
1928          [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
1929          [MESA_SHADER_COMPUTE]   = max_thread_ids,
1930       };
1931       STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1932       memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1933              sizeof(devinfo->max_scratch_ids));
1934    }
1935 }
1936 
1937 bool
intel_get_device_info_from_fd(int fd,struct intel_device_info * devinfo)1938 intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
1939 {
1940    /* Get PCI info.
1941     *
1942     * Some callers may already have a valid drm device which holds values of
1943     * PCI fields queried here prior to calling this function. But making this
1944     * query optional leads to a more cumbersome implementation. These callers
1945     * still need to initialize the fields somewhere out of this function and
1946     * rely on an ioctl to get PCI device id for the next step when skipping
1947     * this drm query.
1948     */
1949    drmDevicePtr drmdev = NULL;
1950    if (drmGetDevice2(fd, DRM_DEVICE_GET_PCI_REVISION, &drmdev)) {
1951       mesa_loge("Failed to query drm device.");
1952       return false;
1953    }
1954    if (!intel_get_device_info_from_pci_id
1955        (drmdev->deviceinfo.pci->device_id, devinfo)) {
1956       drmFreeDevice(&drmdev);
1957       return false;
1958    }
1959    devinfo->pci_domain = drmdev->businfo.pci->domain;
1960    devinfo->pci_bus = drmdev->businfo.pci->bus;
1961    devinfo->pci_dev = drmdev->businfo.pci->dev;
1962    devinfo->pci_func = drmdev->businfo.pci->func;
1963    devinfo->pci_device_id = drmdev->deviceinfo.pci->device_id;
1964    devinfo->pci_revision_id = drmdev->deviceinfo.pci->revision_id;
1965    drmFreeDevice(&drmdev);
1966    devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);
1967 
1968    if (devinfo->ver == 10) {
1969       mesa_loge("Gfx10 support is redacted.");
1970       return false;
1971    }
1972 
1973    /* remaining initializion queries the kernel for device info */
1974    if (devinfo->no_hw) {
1975       /* Provide some sensible values for NO_HW. */
1976       devinfo->gtt_size =
1977          devinfo->ver >= 8 ? (1ull << 48) : 2ull * 1024 * 1024 * 1024;
1978       compute_system_memory(devinfo, false);
1979       return true;
1980    }
1981 
1982    if (intel_get_and_process_hwconfig_table(fd, devinfo)) {
1983       /* After applying hwconfig values, some items need to be recalculated. */
1984       devinfo->max_cs_threads =
1985          devinfo->max_eus_per_subslice * devinfo->num_thread_per_eu;
1986 
1987       update_cs_workgroup_threads(devinfo);
1988    }
1989 
1990    int timestamp_frequency;
1991    if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
1992                 &timestamp_frequency))
1993       devinfo->timestamp_frequency = timestamp_frequency;
1994    else if (devinfo->ver >= 10) {
1995       mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
1996       return false;
1997    }
1998 
1999    if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
2000       devinfo->revision = 0;
2001 
2002    if (!query_topology(devinfo, fd)) {
2003       if (devinfo->ver >= 10) {
2004          /* topology uAPI required for CNL+ (kernel 4.17+) */
2005          return false;
2006       }
2007 
2008       /* else use the kernel 4.13+ api for gfx8+.  For older kernels, topology
2009        * will be wrong, affecting GPU metrics. In this case, fail silently.
2010        */
2011       getparam_topology(devinfo, fd);
2012    }
2013 
2014    /* If the memory region uAPI query is not available, try to generate some
2015     * numbers out of os_* utils for sram only.
2016     */
2017    if (!query_regions(devinfo, fd, false))
2018       compute_system_memory(devinfo, false);
2019 
2020    /* region info is required for lmem support */
2021    if (devinfo->has_local_mem && !devinfo->mem.use_class_instance) {
2022       mesa_logw("Could not query local memory size.");
2023       return false;
2024    }
2025 
2026    if (devinfo->platform == INTEL_PLATFORM_CHV)
2027       fixup_chv_device_info(devinfo);
2028 
2029    /* Broadwell PRM says:
2030     *
2031     *   "Before Gfx8, there was a historical configuration control field to
2032     *    swizzle address bit[6] for in X/Y tiling modes. This was set in three
2033     *    different places: TILECTL[1:0], ARB_MODE[5:4], and
2034     *    DISP_ARB_CTL[14:13].
2035     *
2036     *    For Gfx8 and subsequent generations, the swizzle fields are all
2037     *    reserved, and the CPU's memory controller performs all address
2038     *    swizzling modifications."
2039     */
2040    devinfo->has_bit6_swizzle = devinfo->ver < 8 && has_bit6_swizzle(fd);
2041 
2042    intel_get_aperture_size(fd, &devinfo->aperture_bytes);
2043    get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, &devinfo->gtt_size);
2044    devinfo->has_tiling_uapi = has_get_tiling(fd);
2045 
2046    /* Gfx7 and older do not support EU/Subslice info */
2047    assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
2048    devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
2049 
2050    init_max_scratch_ids(devinfo);
2051 
2052    return true;
2053 }
2054 
/* Refresh the memory figures stored in devinfo.
 *
 * The memory region query is tried first in update mode; if it reports
 * failure, the system memory figures are recomputed from the os_* helpers
 * instead.  Returns true when either path succeeds.
 */
bool
intel_device_info_update_memory_info(struct intel_device_info *devinfo, int fd)
{
   if (query_regions(devinfo, fd, true))
      return true;

   return compute_system_memory(devinfo, true);
}
2059