/*
 * Copyright © 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_gpu_info.h"
#include "ac_shader_util.h"
#include "ac_debug.h"
#include "ac_surface.h"

#include "addrlib/src/amdgpu_asic_addr.h"
#include "sid.h"
#include "util/macros.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/os_misc.h"
#include "util/bitset.h"

#include <stdio.h>
#include <ctype.h>

#define AMDGPU_MI100_RANGE 0x32, 0x3C
#define AMDGPU_MI200_RANGE 0x3C, 0xFF
#define AMDGPU_GFX940_RANGE 0x46, 0xFF

#define ASICREV_IS_MI100(r) ASICREV_IS(r, MI100)
#define ASICREV_IS_MI200(r) ASICREV_IS(r, MI200)
#define ASICREV_IS_GFX940(r) ASICREV_IS(r, GFX940)

#ifdef _WIN32
#define DRM_CAP_ADDFB2_MODIFIERS 0x10
#define DRM_CAP_SYNCOBJ 0x13
#define DRM_CAP_SYNCOBJ_TIMELINE 0x14
#define AMDGPU_GEM_DOMAIN_GTT 0x2
#define AMDGPU_GEM_DOMAIN_VRAM 0x4
#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0)
#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10)
#define AMDGPU_HW_IP_GFX 0
#define AMDGPU_HW_IP_COMPUTE 1
#define AMDGPU_HW_IP_DMA 2
#define AMDGPU_HW_IP_UVD 3
#define AMDGPU_HW_IP_VCE 4
#define AMDGPU_HW_IP_UVD_ENC 5
#define AMDGPU_HW_IP_VCN_DEC 6
#define AMDGPU_HW_IP_VCN_ENC 7
#define AMDGPU_HW_IP_VCN_JPEG 8
#define AMDGPU_HW_IP_VPE 9
#define AMDGPU_IDS_FLAGS_FUSION 0x1
#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2
#define AMDGPU_IDS_FLAGS_TMZ 0x4
#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8
#define AMDGPU_INFO_FW_VCE 0x1
#define AMDGPU_INFO_FW_UVD 0x2
#define AMDGPU_INFO_FW_GFX_ME 0x04
#define AMDGPU_INFO_FW_GFX_PFP 0x05
#define AMDGPU_INFO_FW_GFX_CE 0x06
#define AMDGPU_INFO_DEV_INFO 0x16
#define AMDGPU_INFO_MEMORY 0x19
#define AMDGPU_INFO_VIDEO_CAPS_DECODE 0
#define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1
#define AMDGPU_INFO_FW_GFX_MEC 0x08
#define AMDGPU_INFO_MAX_IBS 0x22

#define AMDGPU_VRAM_TYPE_UNKNOWN 0
#define AMDGPU_VRAM_TYPE_GDDR1 1
#define AMDGPU_VRAM_TYPE_DDR2 2
#define AMDGPU_VRAM_TYPE_GDDR3 3
#define AMDGPU_VRAM_TYPE_GDDR4 4
#define AMDGPU_VRAM_TYPE_GDDR5 5
#define AMDGPU_VRAM_TYPE_HBM 6
#define AMDGPU_VRAM_TYPE_DDR3 7
#define AMDGPU_VRAM_TYPE_DDR4 8
#define AMDGPU_VRAM_TYPE_GDDR6 9
#define AMDGPU_VRAM_TYPE_DDR5 10
#define AMDGPU_VRAM_TYPE_LPDDR4 11
#define AMDGPU_VRAM_TYPE_LPDDR5 12

struct drm_amdgpu_heap_info {
   uint64_t total_heap_size;
};
struct drm_amdgpu_memory_info {
   struct drm_amdgpu_heap_info vram;
   struct drm_amdgpu_heap_info cpu_accessible_vram;
   struct drm_amdgpu_heap_info gtt;
};
struct drm_amdgpu_info_device {
   /** PCI Device ID */
   uint32_t device_id;
   /** Internal chip revision: A0, A1, etc. */
   uint32_t chip_rev;
   uint32_t external_rev;
   /** Revision id in PCI Config space */
   uint32_t pci_rev;
   uint32_t family;
   uint32_t num_shader_engines;
   uint32_t num_shader_arrays_per_engine;
   /* in KHz */
   uint32_t gpu_counter_freq;
   uint64_t max_engine_clock;
   uint64_t max_memory_clock;
   /* cu information */
   uint32_t cu_active_number;
   /* NOTE: cu_ao_mask is INVALID, DON'T use it */
   uint32_t cu_ao_mask;
   uint32_t cu_bitmap[4][4];
   /** Render backend pipe mask. One render backend is CB+DB. */
   uint32_t enabled_rb_pipes_mask;
   uint32_t num_rb_pipes;
   uint32_t num_hw_gfx_contexts;
   /* PCIe version (the smaller of the GPU and the CPU/motherboard) */
   uint32_t pcie_gen;
   uint64_t ids_flags;
   /** Starting virtual address for UMDs. */
   uint64_t virtual_address_offset;
   /** The maximum virtual address */
   uint64_t virtual_address_max;
   /** Required alignment of virtual addresses. */
   uint32_t virtual_address_alignment;
   /** Page table entry - fragment size */
   uint32_t pte_fragment_size;
   uint32_t gart_page_size;
   /** constant engine ram size */
   uint32_t ce_ram_size;
   /** video memory type info */
   uint32_t vram_type;
   /** video memory bit width */
   uint32_t vram_bit_width;
   /* vce harvesting instance */
   uint32_t vce_harvest_config;
   /* gfx double offchip LDS buffers */
   uint32_t gc_double_offchip_lds_buf;
   /* NGG Primitive Buffer */
   uint64_t prim_buf_gpu_addr;
   /* NGG Position Buffer */
   uint64_t pos_buf_gpu_addr;
   /* NGG Control Sideband */
   uint64_t cntl_sb_buf_gpu_addr;
   /* NGG Parameter Cache */
   uint64_t param_buf_gpu_addr;
   uint32_t prim_buf_size;
   uint32_t pos_buf_size;
   uint32_t cntl_sb_buf_size;
   uint32_t param_buf_size;
   /* wavefront size */
   uint32_t wave_front_size;
   /* shader visible vgprs */
   uint32_t num_shader_visible_vgprs;
   /* CU per shader array */
   uint32_t num_cu_per_sh;
   /* number of tcc blocks */
   uint32_t num_tcc_blocks;
   /* gs vgt table depth */
   uint32_t gs_vgt_table_depth;
   /* gs primitive buffer depth */
   uint32_t gs_prim_buffer_depth;
   /* max gs wavefront per vgt */
   uint32_t max_gs_waves_per_vgt;
   /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */
   uint32_t pcie_num_lanes;
   /* always on cu bitmap */
   uint32_t cu_ao_bitmap[4][4];
   /** Starting high virtual address for UMDs. */
   uint64_t high_va_offset;
   /** The maximum high virtual address */
   uint64_t high_va_max;
   /* gfx10 pa_sc_tile_steering_override */
   uint32_t pa_sc_tile_steering_override;
   /* disabled TCCs */
   uint64_t tcc_disabled_mask;
   uint64_t min_engine_clock;
   uint64_t min_memory_clock;
   /* The following fields are only set on gfx11+, older chips set 0. */
   uint32_t tcp_cache_size; /* AKA GL0, VMEM cache */
   uint32_t num_sqc_per_wgp;
   uint32_t sqc_data_cache_size; /* AKA SMEM cache */
   uint32_t sqc_inst_cache_size;
   uint32_t gl1c_cache_size;
   uint32_t gl2c_cache_size;
   uint64_t mall_size; /* AKA infinity cache */
   /* high 32 bits of the rb pipes mask */
   uint32_t enabled_rb_pipes_mask_hi;
   /* shadow area size for gfx11 */
   uint32_t shadow_size;
   /* shadow area base virtual alignment for gfx11 */
   uint32_t shadow_alignment;
   /* context save area size for gfx11 */
   uint32_t csa_size;
   /* context save area base virtual alignment for gfx11 */
   uint32_t csa_alignment;
};
struct drm_amdgpu_info_hw_ip {
   uint32_t hw_ip_version_major;
   uint32_t hw_ip_version_minor;
   uint32_t ib_start_alignment;
   uint32_t ib_size_alignment;
   uint32_t available_rings;
   uint32_t ip_discovery_version;
};
typedef struct _drmPciBusInfo {
   uint16_t domain;
   uint8_t bus;
   uint8_t dev;
   uint8_t func;
} drmPciBusInfo, *drmPciBusInfoPtr;
typedef struct _drmDevice {
   union {
      drmPciBusInfoPtr pci;
   } businfo;
} drmDevice, *drmDevicePtr;
enum amdgpu_sw_info {
   amdgpu_sw_info_address32_hi = 0,
};
typedef struct amdgpu_device *amdgpu_device_handle;
typedef struct amdgpu_bo *amdgpu_bo_handle;
struct amdgpu_bo_alloc_request {
   uint64_t alloc_size;
   uint64_t phys_alignment;
   uint32_t preferred_heap;
   uint64_t flags;
};
struct amdgpu_gds_resource_info {
   uint32_t gds_gfx_partition_size;
   uint32_t gds_total_size;
};
struct amdgpu_buffer_size_alignments {
   uint64_t size_local;
   uint64_t size_remote;
};
struct amdgpu_heap_info {
   uint64_t heap_size;
};
struct amdgpu_gpu_info {
   uint32_t asic_id;
   uint32_t chip_external_rev;
   uint32_t family_id;
   uint64_t ids_flags;
   uint64_t max_engine_clk;
   uint64_t max_memory_clk;
   uint32_t num_shader_engines;
   uint32_t num_shader_arrays_per_engine;
   uint32_t rb_pipes;
   uint32_t enabled_rb_pipes_mask;
   uint32_t gpu_counter_freq;
   uint32_t mc_arb_ramcfg;
   uint32_t gb_addr_cfg;
   uint32_t gb_tile_mode[32];
   uint32_t gb_macro_tile_mode[16];
   uint32_t cu_bitmap[4][4];
   uint32_t vram_type;
   uint32_t vram_bit_width;
   uint32_t ce_ram_size;
   uint32_t vce_harvest_config;
   uint32_t pci_rev_id;
};
static int drmGetCap(int fd, uint64_t capability, uint64_t *value)
{
   return -EINVAL;
}
static void drmFreeDevice(drmDevicePtr *device)
{
}
static int drmGetDevice2(int fd, uint32_t flags, drmDevicePtr *device)
{
   return -ENODEV;
}
static int amdgpu_bo_alloc(amdgpu_device_handle dev,
                           struct amdgpu_bo_alloc_request *alloc_buffer,
                           amdgpu_bo_handle *buf_handle)
{
   return -EINVAL;
}
static int amdgpu_bo_free(amdgpu_bo_handle buf_handle)
{
   return -EINVAL;
}
static int amdgpu_query_buffer_size_alignment(amdgpu_device_handle dev,
                                              struct amdgpu_buffer_size_alignments *info)
{
   return -EINVAL;
}
static int amdgpu_query_firmware_version(amdgpu_device_handle dev, unsigned fw_type,
                                         unsigned ip_instance, unsigned index,
                                         uint32_t *version, uint32_t *feature)
{
   return -EINVAL;
}
static int amdgpu_query_hw_ip_info(amdgpu_device_handle dev, unsigned type,
                                   unsigned ip_instance,
                                   struct drm_amdgpu_info_hw_ip *info)
{
   return -EINVAL;
}
static int amdgpu_query_heap_info(amdgpu_device_handle dev, uint32_t heap,
                                  uint32_t flags, struct amdgpu_heap_info *info)
{
   return -EINVAL;
}
static int amdgpu_query_gpu_info(amdgpu_device_handle dev,
                                 struct amdgpu_gpu_info *info)
{
   return -EINVAL;
}
static int amdgpu_query_info(amdgpu_device_handle dev, unsigned info_id,
                             unsigned size, void *value)
{
   return -EINVAL;
}
static int amdgpu_query_sw_info(amdgpu_device_handle dev, enum amdgpu_sw_info info,
                                void *value)
{
   return -EINVAL;
}
static int amdgpu_query_gds_info(amdgpu_device_handle dev,
                                 struct amdgpu_gds_resource_info *gds_info)
{
   return -EINVAL;
}
static int amdgpu_query_video_caps_info(amdgpu_device_handle dev, unsigned cap_type,
                                        unsigned size, void *value)
{
   return -EINVAL;
}
static const char *amdgpu_get_marketing_name(amdgpu_device_handle dev)
{
   return NULL;
}
static intptr_t readlink(const char *path, char *buf, size_t bufsiz)
{
   return -1;
}
#else
#include "drm-uapi/amdgpu_drm.h"
#include <amdgpu.h>
#include <xf86drm.h>
#include <unistd.h>
#endif

#define CIK_TILE_MODE_COLOR_2D 14

static bool has_timeline_syncobj(int fd)
{
   uint64_t value;
   if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
      return false;
   return value ? true : false;
}

static bool has_modifiers(int fd)
{
   uint64_t value;
   if (drmGetCap(fd, DRM_CAP_ADDFB2_MODIFIERS, &value))
      return false;
   return value ? true : false;
}

static uint64_t fix_vram_size(uint64_t size)
{
   /* The VRAM size is underreported, so we need to fix it, because
    * it's used to compute the number of memory modules for harvesting.
    */
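   /* Illustrative example (assumed values): a reported 8176 MiB would be
    * rounded up to align64(8176 MiB, 256 MiB) = 8192 MiB.
    */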
   return align64(size, 256 * 1024 * 1024);
}

static bool
has_tmz_support(amdgpu_device_handle dev, struct radeon_info *info, uint32_t ids_flags)
{
   struct amdgpu_bo_alloc_request request = {0};
   int r;
   amdgpu_bo_handle bo;

   if (ids_flags & AMDGPU_IDS_FLAGS_TMZ)
      return true;

   /* AMDGPU_IDS_FLAGS_TMZ is supported starting from drm_minor 40 */
   if (info->drm_minor >= 40)
      return false;

   /* Find out ourselves if TMZ is enabled */
   if (info->gfx_level < GFX9)
      return false;

   if (info->drm_minor < 36)
      return false;

   request.alloc_size = 256;
   request.phys_alignment = 1024;
   request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
   request.flags = AMDGPU_GEM_CREATE_ENCRYPTED;
   r = amdgpu_bo_alloc(dev, &request, &bo);
   if (r)
      return false;
   amdgpu_bo_free(bo);
   return true;
}

static void set_custom_cu_en_mask(struct radeon_info *info)
{
   info->spi_cu_en = ~0;

   const char *cu_env_var = os_get_option("AMD_CU_MASK");
   if (!cu_env_var)
      return;

   int size = strlen(cu_env_var);
   char *str = alloca(size + 1);
   memset(str, 0, size + 1);

   size = 0;

   /* Strip whitespace. */
   for (unsigned src = 0; cu_env_var[src]; src++) {
      if (cu_env_var[src] != ' ' && cu_env_var[src] != '\t' &&
          cu_env_var[src] != '\n' && cu_env_var[src] != '\r') {
         str[size++] = cu_env_var[src];
      }
   }

   /* The following syntax is used, all whitespace is ignored:
    *   ID = [0-9][0-9]*                          ex. base 10 numbers
    *   ID_list = (ID | ID-ID)[, (ID | ID-ID)]*   ex. 0,2-4,7
    *   CU_list = 0x[0-F]* | ID_list              ex. 0x337F OR 0,2-4,7
    *   AMD_CU_MASK = CU_list
    *
    * It's a CU mask within a shader array. It's applied to all shader arrays.
    */
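   /* Illustrative example (not exhaustive): AMD_CU_MASK=0,2-4,7 enables
    * CUs {0, 2, 3, 4, 7}, i.e. spi_cu_en = 0x9D; AMD_CU_MASK=0x9D is the
    * equivalent hex form.
    */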
   bool is_good_form = true;
   uint32_t spi_cu_en = 0;

   if (size > 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) {
      str += 2;
      size -= 2;

      for (unsigned i = 0; i < size; i++)
         is_good_form &= isxdigit(str[i]) != 0;

      if (!is_good_form) {
         fprintf(stderr, "amd: invalid AMD_CU_MASK: ill-formed hex value\n");
      } else {
         spi_cu_en = strtol(str, NULL, 16);
      }
   } else {
      /* Parse ID_list. */
      long first = 0, last = -1;

      if (!isdigit(*str)) {
         is_good_form = false;
      } else {
         while (*str) {
            bool comma = false;

            if (isdigit(*str)) {
               first = last = strtol(str, &str, 10);
            } else if (*str == '-') {
               str++;
               /* Parse a digit after a dash. */
               if (isdigit(*str)) {
                  last = strtol(str, &str, 10);
               } else {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: expected a digit after -\n");
                  is_good_form = false;
                  break;
               }
            } else if (*str == ',') {
               comma = true;
               str++;
               if (!isdigit(*str)) {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: expected a digit after ,\n");
                  is_good_form = false;
                  break;
               }
            }

            if (comma || !*str) {
               if (first > last) {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: range not increasing (%li, %li)\n", first, last);
                  is_good_form = false;
                  break;
               }
               if (last > 31) {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: index too large (%li)\n", last);
                  is_good_form = false;
                  break;
               }

               spi_cu_en |= BITFIELD_RANGE(first, last - first + 1);
               last = -1;
            }
         }
      }
   }

   /* The mask is parsed. Now assign bits to CUs. */
   if (is_good_form) {
      bool error = false;

      /* Clear bits that have no effect. */
      spi_cu_en &= BITFIELD_MASK(info->max_good_cu_per_sa);

      if (!spi_cu_en) {
         fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU in each SA must be enabled\n");
         error = true;
      }

      if (info->has_graphics) {
         uint32_t min_full_cu_mask = BITFIELD_MASK(info->min_good_cu_per_sa);

         /* The hw ignores all non-compute CU masks if any of them is 0. Disallow that. */
         if ((spi_cu_en & min_full_cu_mask) == 0) {
            fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be "
                            "enabled (SPI limitation)\n", min_full_cu_mask);
            error = true;
         }

         /* We usually disable 1 or 2 CUs for VS and GS, which means at least 1 other CU
          * must be enabled.
          */
         uint32_t cu_mask_ge, unused;
         ac_compute_late_alloc(info, false, false, false, &unused, &cu_mask_ge);
         cu_mask_ge &= min_full_cu_mask;

         if ((spi_cu_en & cu_mask_ge) == 0) {
            fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be "
                            "enabled (late alloc constraint for GE)\n", cu_mask_ge);
            error = true;
         }

         if ((min_full_cu_mask & spi_cu_en & ~cu_mask_ge) == 0) {
            fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be "
                            "enabled (late alloc constraint for PS)\n",
                    min_full_cu_mask & ~cu_mask_ge);
            error = true;
         }
      }

      if (!error) {
         info->spi_cu_en = spi_cu_en;
         info->spi_cu_en_has_effect = spi_cu_en & BITFIELD_MASK(info->max_good_cu_per_sa);
      }
   }
}

static bool ac_query_pci_bus_info(int fd, struct radeon_info *info)
{
   drmDevicePtr devinfo;

   /* Get PCI info. */
   int r = drmGetDevice2(fd, 0, &devinfo);
   if (r) {
      fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
      info->pci.valid = false;
      return false;
   }
   info->pci.domain = devinfo->businfo.pci->domain;
   info->pci.bus = devinfo->businfo.pci->bus;
   info->pci.dev = devinfo->businfo.pci->dev;
   info->pci.func = devinfo->businfo.pci->func;
   info->pci.valid = true;

   drmFreeDevice(&devinfo);
   return true;
}

static void handle_env_var_force_family(struct radeon_info *info)
{
   const char *family = debug_get_option("AMD_FORCE_FAMILY", NULL);

   if (!family)
      return;

   for (unsigned i = CHIP_TAHITI; i < CHIP_LAST; i++) {
      if (!strcmp(family, ac_get_llvm_processor_name(i))) {
         /* Override family and gfx_level. */
         info->family = i;
         info->name = "NOOP";
         info->gfx_level = ac_get_gfx_level(i);
         info->family_id = ac_get_family_id(i);
         info->family_overridden = true;
         return;
      }
   }

   fprintf(stderr, "radeonsi: Unknown family: %s\n", family);
   exit(1);
}

bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                       bool require_pci_bus_info)
{
   struct amdgpu_gpu_info amdinfo;
   struct drm_amdgpu_info_device device_info = {0};
   struct amdgpu_buffer_size_alignments alignment_info = {0};
   uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
   int r, i, j;
   amdgpu_device_handle dev = dev_p;

   STATIC_ASSERT(AMDGPU_HW_IP_GFX == AMD_IP_GFX);
   STATIC_ASSERT(AMDGPU_HW_IP_COMPUTE == AMD_IP_COMPUTE);
   STATIC_ASSERT(AMDGPU_HW_IP_DMA == AMD_IP_SDMA);
   STATIC_ASSERT(AMDGPU_HW_IP_UVD == AMD_IP_UVD);
   STATIC_ASSERT(AMDGPU_HW_IP_VCE == AMD_IP_VCE);
   STATIC_ASSERT(AMDGPU_HW_IP_UVD_ENC == AMD_IP_UVD_ENC);
   STATIC_ASSERT(AMDGPU_HW_IP_VCN_DEC == AMD_IP_VCN_DEC);
   STATIC_ASSERT(AMDGPU_HW_IP_VCN_ENC == AMD_IP_VCN_ENC);
   STATIC_ASSERT(AMDGPU_HW_IP_VCN_JPEG == AMD_IP_VCN_JPEG);
   STATIC_ASSERT(AMDGPU_HW_IP_VPE == AMD_IP_VPE);

   handle_env_var_force_family(info);

   if (!ac_query_pci_bus_info(fd, info)) {
      if (require_pci_bus_info)
         return false;
   }

   assert(info->drm_major == 3);
   info->is_amdgpu = true;

   if (info->drm_minor < 27) {
      fprintf(stderr, "amdgpu: DRM version is %u.%u.%u, but this driver is "
                      "only compatible with 3.27.0 (kernel 4.20+) or later.\n",
              info->drm_major, info->drm_minor, info->drm_patchlevel);
      return false;
   }

   uint64_t cap;
   r = drmGetCap(fd, DRM_CAP_SYNCOBJ, &cap);
   if (r != 0 || cap == 0) {
      fprintf(stderr, "amdgpu: syncobj support is missing but is required.\n");
      return false;
   }

   /* Query hardware and driver information. */
   r = amdgpu_query_gpu_info(dev, &amdinfo);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
      return false;
   }

   r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n");
      return false;
   }

   r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
      return false;
   }

   for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) {
      struct drm_amdgpu_info_hw_ip ip_info = {0};

      r = amdgpu_query_hw_ip_info(dev, ip_type, 0, &ip_info);
      if (r || !ip_info.available_rings)
         continue;

      /* Gfx6-8 don't set ip_discovery_version. */
      if (info->drm_minor >= 48 && ip_info.ip_discovery_version) {
         info->ip[ip_type].ver_major = (ip_info.ip_discovery_version >> 16) & 0xff;
         info->ip[ip_type].ver_minor = (ip_info.ip_discovery_version >> 8) & 0xff;
         info->ip[ip_type].ver_rev = ip_info.ip_discovery_version & 0xff;
      } else {
         info->ip[ip_type].ver_major = ip_info.hw_ip_version_major;
         info->ip[ip_type].ver_minor = ip_info.hw_ip_version_minor;

         /* Fix incorrect IP versions reported by the kernel. */
         if (device_info.family == FAMILY_NV &&
             (ASICREV_IS(device_info.external_rev, NAVI10) ||
              ASICREV_IS(device_info.external_rev, NAVI12) ||
              ASICREV_IS(device_info.external_rev, NAVI14)))
            info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 1;
         else if (device_info.family == FAMILY_NV ||
                  device_info.family == FAMILY_VGH ||
                  device_info.family == FAMILY_RMB ||
                  device_info.family == FAMILY_RPL ||
                  device_info.family == FAMILY_MDN)
            info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 3;
      }
      info->ip[ip_type].num_queues = util_bitcount(ip_info.available_rings);

      /* According to the kernel, only SDMA and VPE require 256B alignment, but use it
       * for all queues because the kernel reports wrong limits for some of the queues.
       * This is only space allocation alignment, so it's OK to keep it like this even
       * when it's greater than what the queues require.
       */
      info->ip[ip_type].ib_alignment = MAX3(ip_info.ib_start_alignment,
                                            ip_info.ib_size_alignment, 256);
   }

   /* Set dword padding minus 1. */
   info->ip[AMD_IP_GFX].ib_pad_dw_mask = 0x7;
   info->ip[AMD_IP_COMPUTE].ib_pad_dw_mask = 0x7;
   info->ip[AMD_IP_SDMA].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_UVD].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_VCE].ib_pad_dw_mask = 0x3f;
   info->ip[AMD_IP_UVD_ENC].ib_pad_dw_mask = 0x3f;
   info->ip[AMD_IP_VCN_DEC].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_VCN_ENC].ib_pad_dw_mask = 0x3f;
   info->ip[AMD_IP_VCN_JPEG].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_VPE].ib_pad_dw_mask = 0xf;
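   /* Illustrative example (assumed usage): a mask of 0x7 pads the IB to a
    * multiple of 8 dwords, e.g. padded_ndw = (ndw + 0x7) & ~0x7u.
    */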

   /* Only require gfx or compute. */
   if (!info->ip[AMD_IP_GFX].num_queues && !info->ip[AMD_IP_COMPUTE].num_queues) {
      fprintf(stderr, "amdgpu: failed to find gfx or compute.\n");
      return false;
   }

   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version,
                                     &info->me_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
      return false;
   }

   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_MEC, 0, 0, &info->mec_fw_version,
                                     &info->mec_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(mec) failed.\n");
      return false;
   }

   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version,
                                     &info->pfp_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
      return false;
   }

   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &uvd_version, &uvd_feature);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
      return false;
   }

   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vce_version, &vce_feature);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
      return false;
   }

   r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n");
      return false;
   }

   struct drm_amdgpu_memory_info meminfo = {0};

   r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n");
      return false;
   }

   /* Note: usable_heap_size values can be random and can't be relied on. */
   info->gart_size_kb = DIV_ROUND_UP(meminfo.gtt.total_heap_size, 1024);
   info->vram_size_kb = DIV_ROUND_UP(fix_vram_size(meminfo.vram.total_heap_size), 1024);
   info->vram_vis_size_kb = DIV_ROUND_UP(meminfo.cpu_accessible_vram.total_heap_size, 1024);

   if (info->drm_minor >= 41) {
      amdgpu_query_video_caps_info(dev, AMDGPU_INFO_VIDEO_CAPS_DECODE,
                                   sizeof(info->dec_caps), &(info->dec_caps));
      amdgpu_query_video_caps_info(dev, AMDGPU_INFO_VIDEO_CAPS_ENCODE,
                                   sizeof(info->enc_caps), &(info->enc_caps));
   }

   /* Add some margin of error, though this shouldn't be needed in theory. */
   info->all_vram_visible = info->vram_size_kb * 0.9 < info->vram_vis_size_kb;

   /* Set chip identification. */
   info->pci_id = device_info.device_id;
   info->pci_rev_id = device_info.pci_rev;
   info->vce_harvest_config = device_info.vce_harvest_config;

#define identify_chip2(asic, chipname)                                                            \
   if (ASICREV_IS(device_info.external_rev, asic)) {                                              \
      info->family = CHIP_##chipname;                                                             \
      info->name = #chipname;                                                                     \
   }
#define identify_chip(chipname) identify_chip2(chipname, chipname)

   if (!info->family_overridden) {
      switch (device_info.family) {
      case FAMILY_SI:
         identify_chip(TAHITI);
         identify_chip(PITCAIRN);
         identify_chip2(CAPEVERDE, VERDE);
         identify_chip(OLAND);
         identify_chip(HAINAN);
         break;
      case FAMILY_CI:
         identify_chip(BONAIRE);
         identify_chip(HAWAII);
         break;
      case FAMILY_KV:
         identify_chip2(SPECTRE, KAVERI);
         identify_chip2(SPOOKY, KAVERI);
         identify_chip2(KALINDI, KABINI);
         identify_chip2(GODAVARI, KABINI);
         break;
      case FAMILY_VI:
         identify_chip(ICELAND);
         identify_chip(TONGA);
         identify_chip(FIJI);
         identify_chip(POLARIS10);
         identify_chip(POLARIS11);
         identify_chip(POLARIS12);
         identify_chip(VEGAM);
         break;
      case FAMILY_CZ:
         identify_chip(CARRIZO);
         identify_chip(STONEY);
         break;
      case FAMILY_AI:
         identify_chip(VEGA10);
         identify_chip(VEGA12);
         identify_chip(VEGA20);
         identify_chip(MI100);
         identify_chip(MI200);
         identify_chip(GFX940);
         break;
      case FAMILY_RV:
         identify_chip(RAVEN);
         identify_chip(RAVEN2);
         identify_chip(RENOIR);
         break;
      case FAMILY_NV:
         identify_chip(NAVI10);
         identify_chip(NAVI12);
         identify_chip(NAVI14);
         identify_chip(NAVI21);
         identify_chip(NAVI22);
         identify_chip(NAVI23);
         identify_chip(NAVI24);
         break;
      case FAMILY_VGH:
         identify_chip(VANGOGH);
         break;
      case FAMILY_RMB:
         identify_chip(REMBRANDT);
         break;
      case FAMILY_RPL:
         identify_chip2(RAPHAEL, RAPHAEL_MENDOCINO);
         break;
      case FAMILY_MDN:
         identify_chip2(MENDOCINO, RAPHAEL_MENDOCINO);
         break;
      case FAMILY_NV3:
         identify_chip(NAVI31);
         identify_chip(NAVI32);
         identify_chip(NAVI33);
         break;
      case FAMILY_GFX1103:
         identify_chip(GFX1103_R1);
         identify_chip(GFX1103_R2);
         break;
      case FAMILY_GFX1150:
         identify_chip(GFX1150);
         break;
      }

      if (info->ip[AMD_IP_GFX].ver_major == 11 && info->ip[AMD_IP_GFX].ver_minor == 5)
         info->gfx_level = GFX11_5;
      else if (info->ip[AMD_IP_GFX].ver_major == 11 && info->ip[AMD_IP_GFX].ver_minor == 0)
         info->gfx_level = GFX11;
      else if (info->ip[AMD_IP_GFX].ver_major == 10 && info->ip[AMD_IP_GFX].ver_minor == 3)
         info->gfx_level = GFX10_3;
      else if (info->ip[AMD_IP_GFX].ver_major == 10 && info->ip[AMD_IP_GFX].ver_minor == 1)
         info->gfx_level = GFX10;
      else if (info->ip[AMD_IP_GFX].ver_major == 9 || info->ip[AMD_IP_COMPUTE].ver_major == 9)
         info->gfx_level = GFX9;
      else if (info->ip[AMD_IP_GFX].ver_major == 8)
         info->gfx_level = GFX8;
      else if (info->ip[AMD_IP_GFX].ver_major == 7)
         info->gfx_level = GFX7;
      else if (info->ip[AMD_IP_GFX].ver_major == 6)
         info->gfx_level = GFX6;
      else {
         fprintf(stderr, "amdgpu: Unknown gfx version: %u.%u\n",
                 info->ip[AMD_IP_GFX].ver_major, info->ip[AMD_IP_GFX].ver_minor);
         return false;
      }

      info->family_id = device_info.family;
      info->chip_external_rev = device_info.external_rev;
      info->chip_rev = device_info.chip_rev;
      info->marketing_name = amdgpu_get_marketing_name(dev);
      info->is_pro_graphics = info->marketing_name && (strstr(info->marketing_name, "Pro") ||
                                                       strstr(info->marketing_name, "PRO") ||
                                                       strstr(info->marketing_name, "Frontier"));
   }

   if (!info->name) {
      fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
              device_info.family, device_info.external_rev);
      return false;
   }

   memset(info->lowercase_name, 0, sizeof(info->lowercase_name));
   for (unsigned i = 0; info->name[i] && i < ARRAY_SIZE(info->lowercase_name) - 1; i++)
      info->lowercase_name[i] = tolower(info->name[i]);

   char proc_fd[64];
   snprintf(proc_fd, sizeof(proc_fd), "/proc/self/fd/%u", fd);
   UNUSED int _result = readlink(proc_fd, info->dev_filename, sizeof(info->dev_filename));

#define VCN_IP_VERSION(mj, mn, rv) (((mj) << 16) | ((mn) << 8) | (rv))
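/* e.g. VCN_IP_VERSION(3, 0, 33) == 0x030021, matching the VCN_3_0_33 case below. */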

   for (unsigned i = AMD_IP_VCN_DEC; i <= AMD_IP_VCN_JPEG; ++i) {
      if (!info->ip[i].num_queues)
         continue;

      switch (VCN_IP_VERSION(info->ip[i].ver_major,
                             info->ip[i].ver_minor,
                             info->ip[i].ver_rev)) {
      case VCN_IP_VERSION(1, 0, 0):
         info->vcn_ip_version = VCN_1_0_0;
         break;
      case VCN_IP_VERSION(1, 0, 1):
         info->vcn_ip_version = VCN_1_0_1;
         break;
      case VCN_IP_VERSION(2, 0, 0):
         info->vcn_ip_version = VCN_2_0_0;
         break;
      case VCN_IP_VERSION(2, 0, 2):
         info->vcn_ip_version = VCN_2_0_2;
         break;
      case VCN_IP_VERSION(2, 0, 3):
         info->vcn_ip_version = VCN_2_0_3;
         break;
      case VCN_IP_VERSION(2, 2, 0):
         info->vcn_ip_version = VCN_2_2_0;
         break;
      case VCN_IP_VERSION(2, 5, 0):
         info->vcn_ip_version = VCN_2_5_0;
         break;
      case VCN_IP_VERSION(2, 6, 0):
         info->vcn_ip_version = VCN_2_6_0;
         break;
      case VCN_IP_VERSION(3, 0, 0):
         /* The Navi24 version needs correcting if we fall back to the older
          * query path that reports the default 3.0.0 version, because Navi24
          * has a different feature set from the rest of the VCN3 family. */
         info->vcn_ip_version = (info->family != CHIP_NAVI24) ? VCN_3_0_0 : VCN_3_0_33;
         break;
      case VCN_IP_VERSION(3, 0, 2):
         info->vcn_ip_version = VCN_3_0_2;
         break;
      case VCN_IP_VERSION(3, 0, 16):
         info->vcn_ip_version = VCN_3_0_16;
         break;
      case VCN_IP_VERSION(3, 0, 33):
         info->vcn_ip_version = VCN_3_0_33;
         break;
      case VCN_IP_VERSION(3, 1, 1):
         info->vcn_ip_version = VCN_3_1_1;
         break;
      case VCN_IP_VERSION(3, 1, 2):
         info->vcn_ip_version = VCN_3_1_2;
         break;
      case VCN_IP_VERSION(4, 0, 0):
         info->vcn_ip_version = VCN_4_0_0;
         break;
      case VCN_IP_VERSION(4, 0, 2):
         info->vcn_ip_version = VCN_4_0_2;
         break;
      case VCN_IP_VERSION(4, 0, 3):
         info->vcn_ip_version = VCN_4_0_3;
         break;
      case VCN_IP_VERSION(4, 0, 4):
         info->vcn_ip_version = VCN_4_0_4;
         break;
      case VCN_IP_VERSION(4, 0, 5):
         info->vcn_ip_version = VCN_4_0_5;
         break;
      default:
         info->vcn_ip_version = VCN_UNKNOWN;
      }
      break;
   }

   /* Set which chips have dedicated VRAM. */
   info->has_dedicated_vram = !(device_info.ids_flags & AMDGPU_IDS_FLAGS_FUSION);

   /* The kernel can split large buffers in VRAM but not in GTT, so large
    * allocations can fail or cause buffer movement failures in the kernel.
    */
   if (info->has_dedicated_vram)
      info->max_heap_size_kb = info->vram_size_kb;
   else
      info->max_heap_size_kb = info->gart_size_kb;

   info->vram_type = device_info.vram_type;
   info->memory_bus_width = device_info.vram_bit_width;

   /* Set which chips have uncached device memory. */
   info->has_l2_uncached = info->gfx_level >= GFX9;

   /* Set hardware information. */
   /* Convert the shader/memory clocks from KHz to MHz. */
   info->max_gpu_freq_mhz = device_info.max_engine_clock / 1000;
   info->memory_freq_mhz_effective = info->memory_freq_mhz = device_info.max_memory_clock / 1000;
   info->max_tcc_blocks = device_info.num_tcc_blocks;
   info->max_se = device_info.num_shader_engines;
   info->max_sa_per_se = device_info.num_shader_arrays_per_engine;
   info->num_cu_per_sh = device_info.num_cu_per_sh;
   info->uvd_fw_version = info->ip[AMD_IP_UVD].num_queues ? uvd_version : 0;
   info->vce_fw_version = info->ip[AMD_IP_VCE].num_queues ? vce_version : 0;

   info->memory_freq_mhz_effective *= ac_memory_ops_per_clock(info->vram_type);

   info->has_userptr = true;
   info->has_timeline_syncobj = has_timeline_syncobj(fd);
   info->has_local_buffers = true;
   info->has_bo_metadata = true;
   info->has_eqaa_surface_allocator = info->gfx_level < GFX11;
   /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
    * these faults are mitigated in software.
    */
   info->has_sparse_vm_mappings = info->gfx_level >= GFX7;
   info->has_scheduled_fence_dependency = info->drm_minor >= 28;
   info->has_gang_submit = info->drm_minor >= 49;
   info->has_gpuvm_fault_query = info->drm_minor >= 55;
   info->has_tmz_support = has_tmz_support(dev, info, device_info.ids_flags);
   info->kernel_has_modifiers = has_modifiers(fd);
   info->uses_kernel_cu_mask = false; /* Not implemented in the kernel. */
   info->has_graphics = info->ip[AMD_IP_GFX].num_queues > 0;

   info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
   info->max_render_backends = device_info.num_rb_pipes;
   /* The value returned by the kernel driver was wrong. */
   if (info->family == CHIP_KAVERI)
      info->max_render_backends = 2;

   info->clock_crystal_freq = device_info.gpu_counter_freq;
   if (!info->clock_crystal_freq) {
      fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
      info->clock_crystal_freq = 1;
   }
   if (info->gfx_level >= GFX10) {
      info->tcc_cache_line_size = 128;

      if (info->drm_minor >= 35) {
         info->num_tcc_blocks = info->max_tcc_blocks - util_bitcount64(device_info.tcc_disabled_mask);
      } else {
         /* This is a hack, but it's all we can do without a kernel upgrade. */
         info->num_tcc_blocks = info->vram_size_kb / (512 * 1024);
         if (info->num_tcc_blocks > info->max_tcc_blocks)
            info->num_tcc_blocks /= 2;
      }
   } else {
      if (!info->has_graphics && info->family >= CHIP_MI200)
         info->tcc_cache_line_size = 128;
      else
         info->tcc_cache_line_size = 64;

      info->num_tcc_blocks = info->max_tcc_blocks;
   }

   info->tcc_rb_non_coherent = !util_is_power_of_two_or_zero(info->num_tcc_blocks);

   if (info->drm_minor >= 52) {
      info->sqc_inst_cache_size = device_info.sqc_inst_cache_size * 1024;
      info->sqc_scalar_cache_size = device_info.sqc_data_cache_size * 1024;
      info->num_sqc_per_wgp = device_info.num_sqc_per_wgp;
   }

   /* Firmware wrongly reports 0 bytes of MALL being present on Navi33.
    * Work around this by manually computing cache sizes. */
   if (info->gfx_level >= GFX11 && info->drm_minor >= 52 && info->family != CHIP_NAVI33) {
      info->tcp_cache_size = device_info.tcp_cache_size * 1024;
      info->l1_cache_size = device_info.gl1c_cache_size * 1024;
      info->l2_cache_size = device_info.gl2c_cache_size * 1024;
      info->l3_cache_size_mb = DIV_ROUND_UP(device_info.mall_size, 1024 * 1024);
   } else {
      if (info->gfx_level >= GFX11) {
         info->tcp_cache_size = 32768;
         info->l1_cache_size = 256 * 1024;
      } else {
         info->tcp_cache_size = 16384;
         info->l1_cache_size = 128 * 1024;
      }

      if (info->gfx_level >= GFX10_3 && info->has_dedicated_vram) {
         info->l3_cache_size_mb = info->num_tcc_blocks *
                                  (info->family == CHIP_NAVI21 ||
                                   info->family == CHIP_NAVI22 ? 8 : 4);
      }

      switch (info->family) {
      case CHIP_TAHITI:
      case CHIP_PITCAIRN:
      case CHIP_OLAND:
      case CHIP_HAWAII:
      case CHIP_KABINI:
      case CHIP_TONGA:
      case CHIP_STONEY:
      case CHIP_RAVEN2:
         info->l2_cache_size = info->num_tcc_blocks * 64 * 1024;
         break;
      case CHIP_VERDE:
      case CHIP_HAINAN:
      case CHIP_BONAIRE:
      case CHIP_KAVERI:
      case CHIP_ICELAND:
      case CHIP_CARRIZO:
      case CHIP_FIJI:
      case CHIP_POLARIS12:
      case CHIP_VEGAM:
      case CHIP_RAPHAEL_MENDOCINO:
         info->l2_cache_size = info->num_tcc_blocks * 128 * 1024;
         break;
      default:
         info->l2_cache_size = info->num_tcc_blocks * 256 * 1024;
         break;
      case CHIP_REMBRANDT:
      case CHIP_GFX1103_R1:
         info->l2_cache_size = info->num_tcc_blocks * 512 * 1024;
         break;
      }
   }

   info->mc_arb_ramcfg = amdinfo.mc_arb_ramcfg;
   info->gb_addr_config = amdinfo.gb_addr_cfg;
   if (info->gfx_level >= GFX9) {
      info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(info->gb_addr_config);
      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config);
   } else {
      unsigned pipe_config = G_009910_PIPE_CONFIG(amdinfo.gb_tile_mode[CIK_TILE_MODE_COLOR_2D]);
      info->num_tile_pipes = ac_pipe_config_to_num_pipes(pipe_config);
      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config);
   }
   info->r600_has_virtual_memory = true;

   /* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above
    * 16KB makes some SIMDs unoccupied).
    *
    * GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
    * GFX7+: Workgroups can use up to 64KB.
    * GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
    */
   info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024
                                  : info->gfx_level >= GFX7 ? 64 * 1024
                                                            : 32 * 1024;

   /* lds_encode_granularity is the block size used for encoding registers.
    * lds_alloc_granularity is what the hardware will align the LDS size to.
    */
   info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4;
   info->lds_alloc_granularity = info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity;

   /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs on
    * GFX6. Some CLEAR_STATE uses also hang the ASIC on the radeon kernel
    * driver (e.g. around SPI_VS_OUT_CONFIG), so only enable CLEAR_STATE on
    * GFX7+ with the amdgpu kernel driver.
    */
   info->has_clear_state = info->gfx_level >= GFX7;

   info->has_distributed_tess =
      info->gfx_level >= GFX10 || (info->gfx_level >= GFX8 && info->max_se >= 2);

   info->has_dcc_constant_encode =
      info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->gfx_level >= GFX10;

   info->has_rbplus = info->family == CHIP_STONEY || info->gfx_level >= GFX9;

   /* Some chips have RB+ registers, but don't support RB+. Those must
    * always disable it.
    */
   info->rbplus_allowed =
      info->has_rbplus &&
      (info->family == CHIP_STONEY || info->family == CHIP_VEGA12 || info->family == CHIP_RAVEN ||
       info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->gfx_level >= GFX10_3);

   info->has_out_of_order_rast =
      info->gfx_level >= GFX8 && info->gfx_level <= GFX9 && info->max_se >= 2;

   /* Whether chips support double rate packed math instructions. */
   info->has_packed_math_16bit = info->gfx_level >= GFX9;

   /* Whether chips support dot product instructions. A subset of these support a smaller
    * instruction encoding which accumulates with the destination.
    */
   info->has_accelerated_dot_product =
      info->family == CHIP_VEGA20 ||
      (info->family >= CHIP_MI100 && info->family != CHIP_NAVI10);

   /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
   info->has_load_ctx_reg_pkt =
      info->gfx_level >= GFX9 || (info->gfx_level >= GFX8 && info->me_fw_feature >= 41);

   info->cpdma_prefetch_writes_memory = info->gfx_level <= GFX8;

   info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   info->has_tc_compat_zrange_bug = info->gfx_level >= GFX8 && info->gfx_level <= GFX9;

   info->has_small_prim_filter_sample_loc_bug =
      (info->family >= CHIP_POLARIS10 && info->family <= CHIP_POLARIS12) ||
      info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   /* DB_DFSM_CONTROL.POPS_DRAIN_PS_ON_OVERLAP must be enabled for 8 or more coverage or
    * depth/stencil samples with POPS (PAL waMiscPopsMissedOverlap).
    */
   info->has_pops_missed_overlap_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   /* Drawing from 0-sized index buffers causes hangs on gfx10. */
   info->has_zero_index_buffer_bug = info->gfx_level == GFX10;

   /* Whether chips are affected by the image load/sample/gather hw bug when
    * DCC is enabled (ie. WRITE_COMPRESS_ENABLE should be 0).
    */
   info->has_image_load_dcc_bug = info->family == CHIP_NAVI23 ||
                                  info->family == CHIP_VANGOGH ||
                                  info->family == CHIP_REMBRANDT;

   /* DB has a bug when ITERATE_256 is set to 1 that can cause a hang. The
    * workaround is to set DECOMPRESS_ON_Z_PLANES to 2 for 4X MSAA D/S images.
    */
   info->has_two_planes_iterate256_bug = info->gfx_level == GFX10;

   /* GFX10+Navi21: NGG->legacy transitions require VGT_FLUSH. */
   info->has_vgt_flush_ngg_legacy_bug = info->gfx_level == GFX10 ||
                                        info->family == CHIP_NAVI21;

   /* First Navi2x chips have a hw bug that doesn't allow to write
    * depth/stencil from a FS for multi-pixel fragments.
    */
   info->has_vrs_ds_export_bug = info->family == CHIP_NAVI21 ||
                                 info->family == CHIP_NAVI22 ||
                                 info->family == CHIP_VANGOGH;

   /* HW bug workaround when CS threadgroups > 256 threads and async compute
    * isn't used, i.e. only one compute job can run at a time. If async
    * compute is possible, the threadgroup size must be limited to 256 threads
    * on all queues to avoid the bug.
    * Only GFX6 and certain GFX7 chips are affected.
    *
    * FIXME: RADV doesn't limit the number of threads for async compute.
    */
   info->has_cs_regalloc_hang_bug = info->gfx_level == GFX6 ||
                                    info->family == CHIP_BONAIRE ||
                                    info->family == CHIP_KABINI;

   /* HW bug workaround with async compute dispatches when threadgroup > 4096.
    * The workaround is to change the "threadgroup" dimension mode to "thread"
    * dimension mode.
    */
   info->has_async_compute_threadgroup_bug = info->family == CHIP_ICELAND ||
                                             info->family == CHIP_TONGA;

   /* GFX7 CP requires 32 bytes alignment for the indirect buffer arguments on
    * the compute queue.
    */
   info->has_async_compute_align32_bug = info->gfx_level == GFX7;

   /* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the
    * feature version wasn't bumped.
    */
   info->has_32bit_predication = (info->gfx_level >= GFX10 &&
                                  info->me_fw_feature >= 32) ||
                                 (info->gfx_level == GFX9 &&
                                  info->me_fw_feature >= 52);

   /* Firmware bug with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE packets.
    * On old MEC FW versions, it hangs the GPU when indirect count is zero.
    */
   info->has_taskmesh_indirect0_bug = info->gfx_level == GFX10_3 &&
                                      info->mec_fw_version < 100;

   info->has_export_conflict_bug = info->gfx_level == GFX11;

   /* Convert the SDMA version in the current GPU to an enum. */
   info->sdma_ip_version =
      (enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major,
                                            info->ip[AMD_IP_SDMA].ver_minor);

   /* SDMA v1.0-3.x (GFX6-8) can't ignore page faults on unmapped sparse resources. */
   info->sdma_supports_sparse = info->sdma_ip_version >= SDMA_4_0;

   /* SDMA v5.0+ (GFX10+) supports DCC and HTILE, but Navi 10 has issues with it according to PAL. */
   info->sdma_supports_compression = info->sdma_ip_version >= SDMA_5_0 && info->family != CHIP_NAVI10;

   /* Get the number of good compute units. */
   info->num_cu = 0;
   for (i = 0; i < info->max_se; i++) {
      for (j = 0; j < info->max_sa_per_se; j++) {
         if (info->gfx_level >= GFX11) {
            assert(info->max_sa_per_se <= 2);
            info->cu_mask[i][j] = device_info.cu_bitmap[i % 4][(i / 4) * 2 + j];
         } else if (info->family == CHIP_MI100) {
            /* The CU bitmap in the amdgpu info structure is a 4x4 array,
             * which suits Vega ASICs with their 4*2 SE/SA layout.
             * MI100 changes the SE/SA layout to 8*1. To minimize the
             * impact, it's mapped onto the existing bitmap array as:
             *   SE4 --> cu_bitmap[0][1]
             *   SE5 --> cu_bitmap[1][1]
             *   SE6 --> cu_bitmap[2][1]
             *   SE7 --> cu_bitmap[3][1]
             */
            assert(info->max_sa_per_se == 1);
            info->cu_mask[i][0] = device_info.cu_bitmap[i % 4][i / 4];
         } else {
            info->cu_mask[i][j] = device_info.cu_bitmap[i][j];
         }
         info->num_cu += util_bitcount(info->cu_mask[i][j]);
      }
   }

   /* Derive the number of enabled SEs from the CU mask. */
   if (info->gfx_level >= GFX10_3 && info->max_se > 1) {
      info->num_se = 0;

      for (unsigned se = 0; se < info->max_se; se++) {
         for (unsigned sa = 0; sa < info->max_sa_per_se; sa++) {
            if (info->cu_mask[se][sa]) {
               info->num_se++;
               break;
            }
         }
      }
   } else {
      /* GFX10 and older always enable all SEs because they don't support SE harvesting. */
      info->num_se = info->max_se;
   }

   /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
    * and max - min <= 2.
    */
   unsigned cu_group = info->gfx_level >= GFX10 ? 2 : 1;
   info->max_good_cu_per_sa =
      DIV_ROUND_UP(info->num_cu, (info->num_se * info->max_sa_per_se * cu_group)) *
      cu_group;
   info->min_good_cu_per_sa =
      (info->num_cu / (info->num_se * info->max_sa_per_se * cu_group)) * cu_group;
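   /* Illustrative example (assumed values): num_cu = 36, num_se = 2,
    * max_sa_per_se = 2 on gfx10 (cu_group = 2) gives
    * max_good_cu_per_sa = DIV_ROUND_UP(36, 8) * 2 = 10 and
    * min_good_cu_per_sa = (36 / 8) * 2 = 8.
    */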

   memcpy(info->si_tile_mode_array, amdinfo.gb_tile_mode, sizeof(amdinfo.gb_tile_mode));

   info->enabled_rb_mask = device_info.enabled_rb_pipes_mask;
   if (info->drm_minor >= 52)
      info->enabled_rb_mask |= (uint64_t)device_info.enabled_rb_pipes_mask_hi << 32;

   memcpy(info->cik_macrotile_mode_array, amdinfo.gb_macro_tile_mode,
          sizeof(amdinfo.gb_macro_tile_mode));

   info->pte_fragment_size = alignment_info.size_local;
   info->gart_page_size = alignment_info.size_remote;

   if (info->gfx_level == GFX6)
      info->gfx_ib_pad_with_type2 = true;

   if (info->gfx_level >= GFX11) {
      /* On gfx11, power measurements of idle and video playback with
       * num_cu = 4 showed savings, so enable DCC with retile blits on gfx11
       * when num_cu >= 4.
       */
      info->use_display_dcc_with_retile_blit = info->num_cu >= 4;
   } else if (info->gfx_level == GFX10_3) {
      /* Displayable DCC with retiling is known to increase power consumption on Raphael
       * and Mendocino, so disable it on the smallest APUs. We need a proof that
       * displayable DCC doesn't regress bigger chips in the same way.
       */
      info->use_display_dcc_with_retile_blit = info->num_cu > 4;
   } else if (info->gfx_level == GFX9 && !info->has_dedicated_vram &&
              info->drm_minor >= 31) {
      if (info->max_render_backends == 1) {
         info->use_display_dcc_unaligned = true;
      } else {
         /* Displayable DCC may increase power consumption on small APUs
          * with a low CU count. */
         info->use_display_dcc_with_retile_blit = info->num_cu > 4;
      }
   }

   info->has_stable_pstate = info->drm_minor >= 45;

   if (info->gfx_level >= GFX11) {
      info->pc_lines = 1024;
      info->pbb_max_alloc_count = 16; /* minimum is 2, maximum is 256 */
   } else if (info->gfx_level >= GFX9 && info->has_graphics) {
      unsigned pc_lines = 0;

      switch (info->family) {
      case CHIP_VEGA10:
      case CHIP_VEGA12:
      case CHIP_VEGA20:
         pc_lines = 2048;
         break;
      case CHIP_RAVEN:
      case CHIP_RAVEN2:
      case CHIP_RENOIR:
      case CHIP_NAVI10:
      case CHIP_NAVI12:
      case CHIP_NAVI21:
      case CHIP_NAVI22:
      case CHIP_NAVI23:
         pc_lines = 1024;
         break;
      case CHIP_NAVI14:
      case CHIP_NAVI24:
         pc_lines = 512;
         break;
      case CHIP_VANGOGH:
      case CHIP_REMBRANDT:
      case CHIP_RAPHAEL_MENDOCINO:
         pc_lines = 256;
         break;
      default:
         assert(0);
      }

      info->pc_lines = pc_lines;

      if (info->gfx_level >= GFX10) {
         info->pbb_max_alloc_count = pc_lines / 3;
      } else {
         info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se));
      }
   }

   if (info->gfx_level >= GFX10_3)
      info->max_waves_per_simd = 16;
   else if (info->gfx_level == GFX10)
      info->max_waves_per_simd = 20;
   else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM)
      info->max_waves_per_simd = 8;
   else
      info->max_waves_per_simd = 10;

   if (info->gfx_level >= GFX10) {
      info->num_physical_sgprs_per_simd = 128 * info->max_waves_per_simd;
      info->min_sgpr_alloc = 128;
      info->sgpr_alloc_granularity = 128;
   } else if (info->gfx_level >= GFX8) {
      info->num_physical_sgprs_per_simd = 800;
      info->min_sgpr_alloc = 16;
      info->sgpr_alloc_granularity = 16;
   } else {
      info->num_physical_sgprs_per_simd = 512;
      info->min_sgpr_alloc = 8;
      info->sgpr_alloc_granularity = 8;
   }
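   /* Illustrative example: on GFX8, a shader needing 20 SGPRs is allocated
    * align(20, 16) = 32 SGPRs due to the allocation granularity.
    */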

   info->has_3d_cube_border_color_mipmap = info->has_graphics || info->family == CHIP_MI100;
   info->has_image_opcodes = debug_get_bool_option("AMD_IMAGE_OPCODES",
                                                   info->has_graphics || info->family < CHIP_GFX940);
   info->never_stop_sq_perf_counters = info->gfx_level == GFX10 ||
                                       info->gfx_level == GFX10_3;
   info->never_send_perfcounter_stop = info->gfx_level == GFX11;
   info->has_sqtt_rb_harvest_bug = (info->family == CHIP_NAVI23 ||
                                    info->family == CHIP_NAVI24 ||
                                    info->family == CHIP_REMBRANDT ||
                                    info->family == CHIP_VANGOGH) &&
                                   util_bitcount64(info->enabled_rb_mask) !=
                                   info->max_render_backends;

   /* On GFX10.3, the polarity of AUTO_FLUSH_MODE is inverted. */
   info->has_sqtt_auto_flush_mode_bug = info->gfx_level == GFX10_3;

   info->max_sgpr_alloc = info->family == CHIP_TONGA || info->family == CHIP_ICELAND ? 96 : 104;

   if (!info->has_graphics && info->family >= CHIP_MI200) {
      info->min_wave64_vgpr_alloc = 8;
      info->max_vgpr_alloc = 512;
      info->wave64_vgpr_alloc_granularity = 8;
   } else {
      info->min_wave64_vgpr_alloc = 4;
      info->max_vgpr_alloc = 256;
      info->wave64_vgpr_alloc_granularity = 4;
   }

   /* Some GPU info was broken before DRM 3.45.0. */
   if (info->drm_minor >= 45 && device_info.num_shader_visible_vgprs) {
      /* The Gfx10 VGPR count is in Wave32, so divide it by 2 for Wave64.
       * Gfx6-9 numbers are in Wave64.
       */
      if (info->gfx_level >= GFX10)
         info->num_physical_wave64_vgprs_per_simd = device_info.num_shader_visible_vgprs / 2;
      else
         info->num_physical_wave64_vgprs_per_simd = device_info.num_shader_visible_vgprs;
   } else if (info->gfx_level >= GFX10) {
      info->num_physical_wave64_vgprs_per_simd = 512;
   } else {
      info->num_physical_wave64_vgprs_per_simd = 256;
   }

   info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;

   /* BIG_PAGE is supported since gfx10.3 and requires VRAM. VRAM is only guaranteed
    * with AMDGPU_GEM_CREATE_DISCARDABLE. DISCARDABLE was added in DRM 3.47.0.
    */
   info->discardable_allows_big_page = info->gfx_level >= GFX10_3 &&
                                       info->has_dedicated_vram &&
                                       info->drm_minor >= 47;

   /* The maximum number of scratch waves. The number is only a function of the number of CUs.
    * It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count.
    *
    * We can decrease the number to make it fit into the infinity cache.
    */
   const unsigned max_waves_per_tg = 32; /* 1024 threads in Wave32 */
   info->max_scratch_waves = MAX2(32 * info->min_good_cu_per_sa * info->max_sa_per_se * info->num_se,
                                  max_waves_per_tg);
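   /* Illustrative example (assumed values): min_good_cu_per_sa = 8,
    * max_sa_per_se = 2 and num_se = 2 give 32 * 8 * 2 * 2 = 1024 scratch waves.
    */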
   info->num_rb = util_bitcount64(info->enabled_rb_mask);
   info->max_gflops = (info->gfx_level >= GFX11 ? 256 : 128) * info->num_cu * info->max_gpu_freq_mhz / 1000;
   info->memory_bandwidth_gbps = DIV_ROUND_UP(info->memory_freq_mhz_effective * info->memory_bus_width / 8, 1000);
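   /* Illustrative example (assumed values): a 256-bit bus at 14000 MHz
    * effective gives DIV_ROUND_UP(14000 * 256 / 8, 1000) = 448 GB/s.
    */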
   info->has_pcie_bandwidth_info = info->drm_minor >= 51;

   if (info->has_pcie_bandwidth_info) {
      info->pcie_gen = device_info.pcie_gen;
      info->pcie_num_lanes = device_info.pcie_num_lanes;

      /* Source: https://en.wikipedia.org/wiki/PCI_Express#History_and_revisions */
      switch (info->pcie_gen) {
      case 1:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 0.25 * 1024;
         break;
      case 2:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 0.5 * 1024;
         break;
      case 3:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 0.985 * 1024;
         break;
      case 4:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 1.969 * 1024;
         break;
      case 5:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 3.938 * 1024;
         break;
      case 6:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 7.563 * 1024;
         break;
      case 7:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 15.125 * 1024;
         break;
      }
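      /* Illustrative example: PCIe gen4 x16 gives 16 * 1.969 * 1024 ≈ 32260 MB/s
       * (~31.5 GB/s).
       */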
1540 }
1541
1542 /* The number of IBs per submit isn't infinite, it depends on the IP type
1543 * (ie. some initial setup needed for a submit) and the packet size.
1544 * It can be calculated according to the kernel source code as:
1545 * (ring->max_dw - emit_frame_size) / emit_ib_size
1546 */
   r = amdgpu_query_info(dev, AMDGPU_INFO_MAX_IBS,
                         sizeof(info->max_submitted_ibs), info->max_submitted_ibs);
   if (r) {
      /* When the number of IBs can't be queried from the kernel, we choose a
       * rough estimate that should work well (as of kernel 6.3).
       */
      for (unsigned i = 0; i < AMD_NUM_IP_TYPES; ++i)
         info->max_submitted_ibs[i] = 50;

      info->max_submitted_ibs[AMD_IP_GFX] = info->gfx_level >= GFX7 ? 192 : 144;
      info->max_submitted_ibs[AMD_IP_COMPUTE] = 124;
      info->max_submitted_ibs[AMD_IP_VCN_JPEG] = 16;
      for (unsigned i = 0; i < AMD_NUM_IP_TYPES; ++i) {
         /* Clear out max submitted IB count for IPs that have no queues. */
         if (!info->ip[i].num_queues)
            info->max_submitted_ibs[i] = 0;
      }
   }

   if (info->gfx_level >= GFX11) {
      if (info->l3_cache_size_mb) {
         info->attribute_ring_size_per_se = 1400 * 1024;
      } else {
         assert(info->num_se == 1);

         if (info->l2_cache_size >= 2 * 1024 * 1024)
            info->attribute_ring_size_per_se = 768 * 1024;
         else
            info->attribute_ring_size_per_se = info->l2_cache_size / 2;
      }

      /* The size must be aligned to 64K per SE and must be at most 16M in total. */
      info->attribute_ring_size_per_se = align(info->attribute_ring_size_per_se, 64 * 1024);
      assert(info->attribute_ring_size_per_se * info->max_se <= 16 * 1024 * 1024);
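      /* Example with assumed values: 1400 KB per SE aligns up to 1408 KB, so
       * even a hypothetical 8-SE part would use 8 * 1408 KB = 11 MB, within
       * the 16 MB limit checked above.
       */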

      info->conformant_trunc_coord =
         info->drm_minor >= 52 &&
         device_info.ids_flags & AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD;
   }

   if (info->gfx_level >= GFX11 && device_info.shadow_size > 0) {
      info->has_fw_based_shadowing = true;
      info->fw_based_mcbp.shadow_size = device_info.shadow_size;
      info->fw_based_mcbp.shadow_alignment = device_info.shadow_alignment;
      info->fw_based_mcbp.csa_size = device_info.csa_size;
      info->fw_based_mcbp.csa_alignment = device_info.csa_alignment;
   }

   /* WARNING: Register shadowing decreases performance by up to 50% on GFX11 with current FW. */
   info->register_shadowing_required = device_info.ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION &&
                                       info->gfx_level < GFX11;

   if (info->gfx_level >= GFX11 && info->has_dedicated_vram) {
      info->has_set_context_pairs_packed = true;
      info->has_set_sh_pairs_packed = info->register_shadowing_required;
   }

   set_custom_cu_en_mask(info);

   const char *ib_filename = debug_get_option("AMD_PARSE_IB", NULL);
   if (ib_filename) {
      FILE *f = fopen(ib_filename, "r");
      if (f) {
         fseek(f, 0, SEEK_END);
         size_t size = ftell(f);
         uint32_t *ib = (uint32_t *)malloc(size);
         fseek(f, 0, SEEK_SET);
         size_t n_read = fread(ib, 1, size, f);
         fclose(f);

         if (n_read != size) {
            fprintf(stderr, "failed to read %zu bytes from '%s'\n", size, ib_filename);
            exit(1);
         }

         ac_parse_ib(stdout, ib, size / 4, NULL, 0, "IB", info->gfx_level, info->family,
                     AMD_IP_GFX, NULL, NULL);
         free(ib);
         exit(0);
      }
   }
   return true;
}

void ac_compute_driver_uuid(char *uuid, size_t size)
{
   char amd_uuid[] = "AMD-MESA-DRV";

   assert(size >= sizeof(amd_uuid));

   memset(uuid, 0, size);
   strncpy(uuid, amd_uuid, size);
}

void ac_compute_device_uuid(const struct radeon_info *info, char *uuid, size_t size)
{
   uint32_t *uint_uuid = (uint32_t *)uuid;

   assert(size >= sizeof(uint32_t) * 4);

   /* Use the device info directly instead of using a sha1. GL/VK UUIDs
    * are 16 bytes vs 20 bytes for sha1, and the truncation that would be
    * required would get rid of part of the little entropy we have.
    */
   memset(uuid, 0, size);
   if (!info->pci.valid) {
      fprintf(stderr,
              "ac_compute_device_uuid's output is based on invalid pci bus info.\n");
   }
   uint_uuid[0] = info->pci.domain;
   uint_uuid[1] = info->pci.bus;
   uint_uuid[2] = info->pci.dev;
   uint_uuid[3] = info->pci.func;
}

void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
{
   fprintf(f, "Device info:\n");
   fprintf(f, " name = %s\n", info->name);
   fprintf(f, " marketing_name = %s\n", info->marketing_name);
   fprintf(f, " dev_filename = %s\n", info->dev_filename);
   fprintf(f, " num_se = %i\n", info->num_se);
   fprintf(f, " num_rb = %i\n", info->num_rb);
   fprintf(f, " num_cu = %i\n", info->num_cu);
   fprintf(f, " max_gpu_freq = %i MHz\n", info->max_gpu_freq_mhz);
   fprintf(f, " max_gflops = %u GFLOPS\n", info->max_gflops);

   if (info->sqc_inst_cache_size) {
      fprintf(f, " sqc_inst_cache_size = %i KB (%u per WGP)\n",
              DIV_ROUND_UP(info->sqc_inst_cache_size, 1024), info->num_sqc_per_wgp);
   }
   if (info->sqc_scalar_cache_size) {
      fprintf(f, " sqc_scalar_cache_size = %i KB (%u per WGP)\n",
              DIV_ROUND_UP(info->sqc_scalar_cache_size, 1024), info->num_sqc_per_wgp);
   }

   fprintf(f, " tcp_cache_size = %i KB\n", DIV_ROUND_UP(info->tcp_cache_size, 1024));

   if (info->gfx_level >= GFX10)
      fprintf(f, " l1_cache_size = %i KB\n", DIV_ROUND_UP(info->l1_cache_size, 1024));

   fprintf(f, " l2_cache_size = %i KB\n", DIV_ROUND_UP(info->l2_cache_size, 1024));

   if (info->l3_cache_size_mb)
      fprintf(f, " l3_cache_size = %i MB\n", info->l3_cache_size_mb);

   fprintf(f, " memory_channels = %u (TCC blocks)\n", info->num_tcc_blocks);
   fprintf(f, " memory_size = %u GB (%u MB)\n",
           DIV_ROUND_UP(info->vram_size_kb, (1024 * 1024)),
           DIV_ROUND_UP(info->vram_size_kb, 1024));
   fprintf(f, " memory_freq = %u GHz\n", DIV_ROUND_UP(info->memory_freq_mhz_effective, 1000));
   fprintf(f, " memory_bus_width = %u bits\n", info->memory_bus_width);
   fprintf(f, " memory_bandwidth = %u GB/s\n", info->memory_bandwidth_gbps);
   fprintf(f, " pcie_gen = %u\n", info->pcie_gen);
   fprintf(f, " pcie_num_lanes = %u\n", info->pcie_num_lanes);
   fprintf(f, " pcie_bandwidth = %1.1f GB/s\n", info->pcie_bandwidth_mbps / 1024.0);
   fprintf(f, " clock_crystal_freq = %i KHz\n", info->clock_crystal_freq);

   const char *ip_string[AMD_NUM_IP_TYPES] = {
      [AMD_IP_GFX] = "GFX",
      [AMD_IP_COMPUTE] = "COMP",
      [AMD_IP_SDMA] = "SDMA",
      [AMD_IP_UVD] = "UVD",
      [AMD_IP_VCE] = "VCE",
      [AMD_IP_UVD_ENC] = "UVD_ENC",
      [AMD_IP_VCN_DEC] = "VCN_DEC",
      [AMD_IP_VCN_ENC] = (info->vcn_ip_version >= VCN_4_0_0) ? "VCN" : "VCN_ENC",
      [AMD_IP_VCN_JPEG] = "VCN_JPG",
      [AMD_IP_VPE] = "VPE",
   };

   for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
      if (info->ip[i].num_queues) {
         fprintf(f, " IP %-7s %2u.%u \tqueues:%u \talign:%u \tpad_dw:0x%x\n", ip_string[i],
                 info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues,
                 info->ip[i].ib_alignment, info->ip[i].ib_pad_dw_mask);
      }
   }

   fprintf(f, "Identification:\n");
   if (info->pci.valid)
      fprintf(f, " pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci.domain, info->pci.bus,
              info->pci.dev, info->pci.func);
   else
      fprintf(f, " pci (domain:bus:dev.func): unknown\n");
   fprintf(f, " pci_id = 0x%x\n", info->pci_id);
   fprintf(f, " pci_rev_id = 0x%x\n", info->pci_rev_id);
   fprintf(f, " family = %i\n", info->family);
   fprintf(f, " gfx_level = %i\n", info->gfx_level);
   fprintf(f, " family_id = %i\n", info->family_id);
   fprintf(f, " chip_external_rev = %i\n", info->chip_external_rev);
   fprintf(f, " chip_rev = %i\n", info->chip_rev);

   fprintf(f, "Flags:\n");
   fprintf(f, " family_overridden = %u\n", info->family_overridden);
   fprintf(f, " is_pro_graphics = %u\n", info->is_pro_graphics);
   fprintf(f, " has_graphics = %i\n", info->has_graphics);
   fprintf(f, " has_clear_state = %u\n", info->has_clear_state);
   fprintf(f, " has_distributed_tess = %u\n", info->has_distributed_tess);
   fprintf(f, " has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode);
   fprintf(f, " has_rbplus = %u\n", info->has_rbplus);
   fprintf(f, " rbplus_allowed = %u\n", info->rbplus_allowed);
   fprintf(f, " has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt);
   fprintf(f, " has_out_of_order_rast = %u\n", info->has_out_of_order_rast);
   fprintf(f, " cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory);
   fprintf(f, " has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug);
   fprintf(f, " has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug);
   fprintf(f, " has_small_prim_filter_sample_loc_bug = %i\n", info->has_small_prim_filter_sample_loc_bug);
   fprintf(f, " has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug);
   fprintf(f, " has_pops_missed_overlap_bug = %i\n", info->has_pops_missed_overlap_bug);
   fprintf(f, " has_32bit_predication = %i\n", info->has_32bit_predication);
   fprintf(f, " has_3d_cube_border_color_mipmap = %i\n", info->has_3d_cube_border_color_mipmap);
   fprintf(f, " has_image_opcodes = %i\n", info->has_image_opcodes);
   fprintf(f, " never_stop_sq_perf_counters = %i\n", info->never_stop_sq_perf_counters);
   fprintf(f, " has_sqtt_rb_harvest_bug = %i\n", info->has_sqtt_rb_harvest_bug);
   fprintf(f, " has_sqtt_auto_flush_mode_bug = %i\n", info->has_sqtt_auto_flush_mode_bug);
   fprintf(f, " never_send_perfcounter_stop = %i\n", info->never_send_perfcounter_stop);
   fprintf(f, " discardable_allows_big_page = %i\n", info->discardable_allows_big_page);
   fprintf(f, " has_taskmesh_indirect0_bug = %i\n", info->has_taskmesh_indirect0_bug);
   fprintf(f, " has_set_context_pairs_packed = %i\n", info->has_set_context_pairs_packed);
   fprintf(f, " has_set_sh_pairs_packed = %i\n", info->has_set_sh_pairs_packed);
   fprintf(f, " conformant_trunc_coord = %i\n", info->conformant_trunc_coord);

   fprintf(f, "Display features:\n");
   fprintf(f, " use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
   fprintf(f, " use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);

   fprintf(f, "Memory info:\n");
   fprintf(f, " pte_fragment_size = %u\n", info->pte_fragment_size);
   fprintf(f, " gart_page_size = %u\n", info->gart_page_size);
   fprintf(f, " gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size_kb, 1024));
   fprintf(f, " vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size_kb, 1024));
   fprintf(f, " vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size_kb, 1024));
   fprintf(f, " vram_type = %i\n", info->vram_type);
   fprintf(f, " max_heap_size_kb = %i MB\n", (int)DIV_ROUND_UP(info->max_heap_size_kb, 1024));
   fprintf(f, " min_alloc_size = %u\n", info->min_alloc_size);
   fprintf(f, " address32_hi = 0x%x\n", info->address32_hi);
   fprintf(f, " has_dedicated_vram = %u\n", info->has_dedicated_vram);
   fprintf(f, " all_vram_visible = %u\n", info->all_vram_visible);
   fprintf(f, " max_tcc_blocks = %i\n", info->max_tcc_blocks);
   fprintf(f, " tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
   fprintf(f, " tcc_rb_non_coherent = %u\n", info->tcc_rb_non_coherent);
   fprintf(f, " pc_lines = %u\n", info->pc_lines);
   fprintf(f, " lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
   fprintf(f, " lds_alloc_granularity = %i\n", info->lds_alloc_granularity);
   fprintf(f, " lds_encode_granularity = %i\n", info->lds_encode_granularity);
   fprintf(f, " max_memory_clock = %i MHz\n", info->memory_freq_mhz);

   fprintf(f, "CP info:\n");
   fprintf(f, " gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
   fprintf(f, " me_fw_version = %i\n", info->me_fw_version);
   fprintf(f, " me_fw_feature = %i\n", info->me_fw_feature);
   fprintf(f, " mec_fw_version = %i\n", info->mec_fw_version);
   fprintf(f, " mec_fw_feature = %i\n", info->mec_fw_feature);
   fprintf(f, " pfp_fw_version = %i\n", info->pfp_fw_version);
   fprintf(f, " pfp_fw_feature = %i\n", info->pfp_fw_feature);

   fprintf(f, "Multimedia info:\n");
   fprintf(f, " vce_encode = %u\n", info->ip[AMD_IP_VCE].num_queues);

   if (info->family >= CHIP_NAVI31 || info->family == CHIP_GFX940)
      fprintf(f, " vcn_unified = %u\n", info->ip[AMD_IP_VCN_UNIFIED].num_queues);
   else {
      fprintf(f, " vcn_decode = %u\n", info->ip[AMD_IP_VCN_DEC].num_queues);
      fprintf(f, " vcn_encode = %u\n", info->ip[AMD_IP_VCN_ENC].num_queues);
   }

   fprintf(f, " uvd_fw_version = %u\n", info->uvd_fw_version);
   fprintf(f, " vce_fw_version = %u\n", info->vce_fw_version);
   fprintf(f, " vce_harvest_config = %i\n", info->vce_harvest_config);

   fprintf(f, "Kernel & winsys capabilities:\n");
   fprintf(f, " drm = %i.%i.%i\n", info->drm_major, info->drm_minor, info->drm_patchlevel);
   fprintf(f, " has_userptr = %i\n", info->has_userptr);
   fprintf(f, " has_timeline_syncobj = %u\n", info->has_timeline_syncobj);
   fprintf(f, " has_local_buffers = %u\n", info->has_local_buffers);
   fprintf(f, " has_bo_metadata = %u\n", info->has_bo_metadata);
   fprintf(f, " has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
   fprintf(f, " has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
   fprintf(f, " has_stable_pstate = %u\n", info->has_stable_pstate);
   fprintf(f, " has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
   fprintf(f, " has_gang_submit = %u\n", info->has_gang_submit);
   fprintf(f, " has_gpuvm_fault_query = %u\n", info->has_gpuvm_fault_query);
   fprintf(f, " register_shadowing_required = %u\n", info->register_shadowing_required);
   fprintf(f, " has_fw_based_shadowing = %u\n", info->has_fw_based_shadowing);
   if (info->has_fw_based_shadowing) {
      fprintf(f, " * shadow size: %u (alignment: %u)\n",
              info->fw_based_mcbp.shadow_size,
              info->fw_based_mcbp.shadow_alignment);
      fprintf(f, " * csa size: %u (alignment: %u)\n",
              info->fw_based_mcbp.csa_size,
              info->fw_based_mcbp.csa_alignment);
   }

   fprintf(f, " has_tmz_support = %u\n", info->has_tmz_support);
   for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
      if (info->max_submitted_ibs[i]) {
         fprintf(f, " IP %-7s max_submitted_ibs = %u\n", ip_string[i],
                 info->max_submitted_ibs[i]);
      }
   }
   fprintf(f, " kernel_has_modifiers = %u\n", info->kernel_has_modifiers);
   fprintf(f, " uses_kernel_cu_mask = %u\n", info->uses_kernel_cu_mask);

   fprintf(f, "Shader core info:\n");
   for (unsigned i = 0; i < info->max_se; i++) {
      for (unsigned j = 0; j < info->max_sa_per_se; j++) {
         fprintf(f, " cu_mask[SE%u][SA%u] = 0x%x \t(%u)\tCU_EN = 0x%x\n", i, j,
                 info->cu_mask[i][j], util_bitcount(info->cu_mask[i][j]),
                 info->spi_cu_en & BITFIELD_MASK(util_bitcount(info->cu_mask[i][j])));
      }
   }
   fprintf(f, " spi_cu_en_has_effect = %i\n", info->spi_cu_en_has_effect);
   fprintf(f, " max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa);
   fprintf(f, " min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa);
   fprintf(f, " max_se = %i\n", info->max_se);
   fprintf(f, " max_sa_per_se = %i\n", info->max_sa_per_se);
   fprintf(f, " num_cu_per_sh = %i\n", info->num_cu_per_sh);
   fprintf(f, " max_waves_per_simd = %i\n", info->max_waves_per_simd);
   fprintf(f, " num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd);
   fprintf(f, " num_physical_wave64_vgprs_per_simd = %i\n",
           info->num_physical_wave64_vgprs_per_simd);
   fprintf(f, " num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit);
   fprintf(f, " min_sgpr_alloc = %i\n", info->min_sgpr_alloc);
   fprintf(f, " max_sgpr_alloc = %i\n", info->max_sgpr_alloc);
   fprintf(f, " sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity);
   fprintf(f, " min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
   fprintf(f, " max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
   fprintf(f, " wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
   fprintf(f, " max_scratch_waves = %i\n", info->max_scratch_waves);
   fprintf(f, " attribute_ring_size_per_se = %u\n", info->attribute_ring_size_per_se);

   fprintf(f, "Render backend info:\n");
   fprintf(f, " pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
   fprintf(f, " max_render_backends = %i\n", info->max_render_backends);
   fprintf(f, " num_tile_pipes = %i\n", info->num_tile_pipes);
   fprintf(f, " pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
   fprintf(f, " enabled_rb_mask = 0x%" PRIx64 "\n", info->enabled_rb_mask);
   fprintf(f, " max_alignment = %u\n", (unsigned)info->max_alignment);
   fprintf(f, " pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);

   fprintf(f, "GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config);
   if (info->gfx_level >= GFX10) {
      fprintf(f, " num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, " pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
      fprintf(f, " max_compressed_frags = %u\n",
              1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
      if (info->gfx_level >= GFX10_3)
         fprintf(f, " num_pkrs = %u\n", 1 << G_0098F8_NUM_PKRS(info->gb_addr_config));
   } else if (info->gfx_level == GFX9) {
      fprintf(f, " num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, " pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
      fprintf(f, " max_compressed_frags = %u\n",
              1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
      fprintf(f, " bank_interleave_size = %u\n",
              1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
      fprintf(f, " num_banks = %u\n", 1 << G_0098F8_NUM_BANKS(info->gb_addr_config));
      fprintf(f, " shader_engine_tile_size = %u\n",
              16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
      fprintf(f, " num_shader_engines = %u\n",
              1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config));
      fprintf(f, " num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config));
      fprintf(f, " multi_gpu_tile_size = %u (raw)\n",
              G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
      fprintf(f, " num_rb_per_se = %u\n", 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config));
      fprintf(f, " row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
      fprintf(f, " num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
      fprintf(f, " se_enable = %u (raw)\n", G_0098F8_SE_ENABLE(info->gb_addr_config));
   } else {
      fprintf(f, " num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, " pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config));
      fprintf(f, " bank_interleave_size = %u\n",
              1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
      fprintf(f, " num_shader_engines = %u\n",
              1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config));
      fprintf(f, " shader_engine_tile_size = %u\n",
              16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
      fprintf(f, " num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config));
      fprintf(f, " multi_gpu_tile_size = %u (raw)\n",
              G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
      fprintf(f, " row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
      fprintf(f, " num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
   }
}

int ac_get_gs_table_depth(enum amd_gfx_level gfx_level, enum radeon_family family)
{
   if (gfx_level >= GFX9)
      return -1;

   switch (family) {
   case CHIP_OLAND:
   case CHIP_HAINAN:
   case CHIP_KAVERI:
   case CHIP_KABINI:
   case CHIP_ICELAND:
   case CHIP_CARRIZO:
   case CHIP_STONEY:
      return 16;
   case CHIP_TAHITI:
   case CHIP_PITCAIRN:
   case CHIP_VERDE:
   case CHIP_BONAIRE:
   case CHIP_HAWAII:
   case CHIP_TONGA:
   case CHIP_FIJI:
   case CHIP_POLARIS10:
   case CHIP_POLARIS11:
   case CHIP_POLARIS12:
   case CHIP_VEGAM:
      return 32;
   default:
      unreachable("Unknown GPU");
   }
}

void ac_get_raster_config(const struct radeon_info *info, uint32_t *raster_config_p,
                          uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p)
{
   unsigned raster_config, raster_config_1, se_tile_repeat;

   switch (info->family) {
   /* 1 SE / 1 RB */
   case CHIP_HAINAN:
   case CHIP_KABINI:
   case CHIP_STONEY:
      raster_config = 0x00000000;
      raster_config_1 = 0x00000000;
      break;
   /* 1 SE / 4 RBs */
   case CHIP_VERDE:
      raster_config = 0x0000124a;
      raster_config_1 = 0x00000000;
      break;
   /* 1 SE / 2 RBs (Oland is special) */
   case CHIP_OLAND:
      raster_config = 0x00000082;
      raster_config_1 = 0x00000000;
      break;
   /* 1 SE / 2 RBs */
   case CHIP_KAVERI:
   case CHIP_ICELAND:
   case CHIP_CARRIZO:
      raster_config = 0x00000002;
      raster_config_1 = 0x00000000;
      break;
   /* 2 SEs / 4 RBs */
   case CHIP_BONAIRE:
   case CHIP_POLARIS11:
   case CHIP_POLARIS12:
      raster_config = 0x16000012;
      raster_config_1 = 0x00000000;
      break;
   /* 2 SEs / 8 RBs */
   case CHIP_TAHITI:
   case CHIP_PITCAIRN:
      raster_config = 0x2a00126a;
      raster_config_1 = 0x00000000;
      break;
   /* 4 SEs / 8 RBs */
   case CHIP_TONGA:
   case CHIP_POLARIS10:
      raster_config = 0x16000012;
      raster_config_1 = 0x0000002a;
      break;
   /* 4 SEs / 16 RBs */
   case CHIP_HAWAII:
   case CHIP_FIJI:
   case CHIP_VEGAM:
      raster_config = 0x3a00161a;
      raster_config_1 = 0x0000002e;
      break;
   default:
      fprintf(stderr, "ac: Unknown GPU, using 0 for raster_config\n");
      raster_config = 0x00000000;
      raster_config_1 = 0x00000000;
      break;
   }

   /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
    * This decreases performance by up to 50% when the RB is the bottleneck.
    */
   if (info->family == CHIP_KAVERI && !info->is_amdgpu)
      raster_config = 0x00000000;

   /* Fiji: Old kernels have incorrect tiling config. This decreases
    * RB performance by 25%. (it disables 1 RB in the second packer)
    */
   if (info->family == CHIP_FIJI && info->cik_macrotile_mode_array[0] == 0x000000e8) {
      raster_config = 0x16000012;
      raster_config_1 = 0x0000002a;
   }

   unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config);
   unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config);

   /* I don't know how to calculate this, though this is probably a good guess. */
   se_tile_repeat = MAX2(se_width, se_height) * info->max_se;
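   /* For example, if SE_XSEL and SE_YSEL were both 1, then
    * se_width = se_height = 16 and a 2-SE chip would get
    * se_tile_repeat = 16 * 2 = 32.
    */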

   *raster_config_p = raster_config;
   *raster_config_1_p = raster_config_1;
   if (se_tile_repeat_p)
      *se_tile_repeat_p = se_tile_repeat;
}

void ac_get_harvested_configs(const struct radeon_info *info, unsigned raster_config,
                              unsigned *cik_raster_config_1_p, unsigned *raster_config_se)
{
   unsigned sh_per_se = MAX2(info->max_sa_per_se, 1);
   unsigned num_se = MAX2(info->max_se, 1);
   unsigned rb_mask = info->enabled_rb_mask;
   unsigned num_rb = MIN2(info->max_render_backends, 16);
   unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
   unsigned rb_per_se = num_rb / num_se;
   unsigned se_mask[4];
   unsigned se;

   se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
   se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
   se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
   se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
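   /* Worked example with a hypothetical harvest: rb_per_se = 4 and
    * rb_mask = 0xfff7 (one RB disabled in SE0) yield se_mask[0] = 0x7,
    * se_mask[1] = 0x70, se_mask[2] = 0x700 and se_mask[3] = 0x7000.
    */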

   assert(num_se == 1 || num_se == 2 || num_se == 4);
   assert(sh_per_se == 1 || sh_per_se == 2);
   assert(rb_per_pkr == 1 || rb_per_pkr == 2);

   if (info->gfx_level >= GFX7) {
      unsigned raster_config_1 = *cik_raster_config_1_p;
      if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || (!se_mask[2] && !se_mask[3]))) {
         raster_config_1 &= C_028354_SE_PAIR_MAP;

         if (!se_mask[0] && !se_mask[1]) {
            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
         } else {
            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
         }
         *cik_raster_config_1_p = raster_config_1;
      }
   }

   for (se = 0; se < num_se; se++) {
      unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
      unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
      int idx = (se / 2) * 2;

      raster_config_se[se] = raster_config;
      if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
         raster_config_se[se] &= C_028350_SE_MAP;

         if (!se_mask[idx]) {
            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
         } else {
            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
         }
      }

      pkr0_mask &= rb_mask;
      pkr1_mask &= rb_mask;
      if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
         raster_config_se[se] &= C_028350_PKR_MAP;

         if (!pkr0_mask) {
            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
         } else {
            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
         }
      }

      if (rb_per_se >= 2) {
         unsigned rb0_mask = 1 << (se * rb_per_se);
         unsigned rb1_mask = rb0_mask << 1;

         rb0_mask &= rb_mask;
         rb1_mask &= rb_mask;
         if (!rb0_mask || !rb1_mask) {
            raster_config_se[se] &= C_028350_RB_MAP_PKR0;

            if (!rb0_mask) {
               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
            } else {
               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
            }
         }

         if (rb_per_se > 2) {
            rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
            rb1_mask = rb0_mask << 1;
            rb0_mask &= rb_mask;
            rb1_mask &= rb_mask;
            if (!rb0_mask || !rb1_mask) {
               raster_config_se[se] &= C_028350_RB_MAP_PKR1;

               if (!rb0_mask) {
                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
               } else {
                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
               }
            }
         }
      }
   }
}

unsigned
ac_get_compute_resource_limits(const struct radeon_info *info, unsigned waves_per_threadgroup,
                               unsigned max_waves_per_sh, unsigned threadgroups_per_cu)
{
   unsigned compute_resource_limits = S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);

   if (info->gfx_level >= GFX7) {
      unsigned num_cu_per_se = info->num_cu / info->num_se;

      /* Gfx9 should set the limit to max instead of 0 to fix high priority compute. */
      if (info->gfx_level == GFX9 && !max_waves_per_sh) {
         max_waves_per_sh = info->max_good_cu_per_sa * info->num_simd_per_compute_unit *
                            info->max_waves_per_simd;
      }

      /* Force even distribution on all SIMDs in CU if the workgroup
       * size is 64. This has shown some good improvements if # of CUs
       * per SE is not a multiple of 4.
       */
      if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
         compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);

      assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8);
      compute_resource_limits |=
         S_00B854_WAVES_PER_SH(max_waves_per_sh) | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
   } else {
      /* GFX6 */
      if (max_waves_per_sh) {
         unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16);
         compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16);
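         /* e.g. a requested limit of 40 waves rounds up to
          * DIV_ROUND_UP(40, 16) = 3, i.e. a granularity of 48 waves per SH.
          */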
      }
   }
   return compute_resource_limits;
}

void ac_get_hs_info(const struct radeon_info *info,
                    struct ac_hs_info *hs)
{
   bool double_offchip_buffers = info->gfx_level >= GFX7 &&
                                 info->family != CHIP_CARRIZO &&
                                 info->family != CHIP_STONEY;
   unsigned max_offchip_buffers_per_se;
   unsigned max_offchip_buffers;
   unsigned offchip_granularity;
   unsigned hs_offchip_param;

   hs->tess_offchip_block_dw_size =
      info->family == CHIP_HAWAII ? 4096 : 8192;

   /*
    * Per RadeonSI:
    * This must be one less than the maximum number due to a hw limitation.
    * Various hardware bugs need this.
    *
    * Per AMDVLK:
    * Vega10 should limit max_offchip_buffers to 508 (4 * 127).
    * Gfx7 should limit max_offchip_buffers to 508.
    * Gfx6 should limit max_offchip_buffers to 126 (2 * 63).
    *
    * Follow AMDVLK here.
    */
   if (info->gfx_level >= GFX11) {
      max_offchip_buffers_per_se = 256; /* TODO: we could decrease this to reduce memory/cache usage */
   } else if (info->gfx_level >= GFX10) {
      max_offchip_buffers_per_se = 128;
   } else if (info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
      /* Only certain chips can use the maximum value. */
      max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
   } else {
      max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
   }

   max_offchip_buffers = max_offchip_buffers_per_se * info->max_se;
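   /* e.g. a hypothetical Gfx9 part with 4 SEs gets 127 * 4 = 508 buffers,
    * matching the AMDVLK limit quoted above.
    */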

   /* Hawaii has a bug with offchip buffers > 256 that can be worked
    * around by setting 4K granularity.
    */
   if (hs->tess_offchip_block_dw_size == 4096) {
      assert(info->family == CHIP_HAWAII);
      offchip_granularity = V_03093C_X_4K_DWORDS;
   } else {
      assert(hs->tess_offchip_block_dw_size == 8192);
      offchip_granularity = V_03093C_X_8K_DWORDS;
   }

   switch (info->gfx_level) {
   case GFX6:
      max_offchip_buffers = MIN2(max_offchip_buffers, 126);
      break;
   case GFX7:
   case GFX8:
   case GFX9:
      max_offchip_buffers = MIN2(max_offchip_buffers, 508);
      break;
   case GFX10:
      break;
   default:
      break;
   }

   hs->max_offchip_buffers = max_offchip_buffers;

   if (info->gfx_level >= GFX11) {
      /* OFFCHIP_BUFFERING is per SE. */
      hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers_per_se - 1) |
                         S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity);
   } else if (info->gfx_level >= GFX10_3) {
      hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers - 1) |
                         S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity);
   } else if (info->gfx_level >= GFX7) {
      if (info->gfx_level >= GFX8)
         --max_offchip_buffers;
      hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(max_offchip_buffers) |
                         S_03093C_OFFCHIP_GRANULARITY_GFX7(offchip_granularity);
   } else {
      hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
   }

   hs->hs_offchip_param = hs_offchip_param;

   hs->tess_factor_ring_size = 48 * 1024 * info->max_se;
   hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024);
   hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4;
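   /* Worked example under assumed values (max_se = 4, 508 buffers, 8K-dword
    * blocks): tess_factor_ring_size = 192 KB, tess_offchip_ring_offset =
    * 192 KB (already 64 KB aligned), and tess_offchip_ring_size =
    * 508 * 8192 * 4 bytes, i.e. about 15.9 MB.
    */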
}

static uint16_t get_task_num_entries(enum radeon_family fam)
{
   /* Number of task shader ring entries. Needs to be a power of two.
    * Use a low number on smaller chips so we don't waste space,
    * but keep it high on bigger chips so it doesn't inhibit parallelism.
    *
    * This number is compiled into task/mesh shaders as a constant.
    * In order to ensure this works fine with the shader cache, we must
    * base this decision on the chip family, not the number of CUs in
    * the current GPU. (So, the cache remains consistent for all
    * chips in the same family.)
    */
   switch (fam) {
   case CHIP_VANGOGH:
   case CHIP_NAVI24:
   case CHIP_REMBRANDT:
      return 256;
   case CHIP_NAVI21:
   case CHIP_NAVI22:
   case CHIP_NAVI23:
   default:
      return 1024;
   }
}

void ac_get_task_info(const struct radeon_info *info,
                      struct ac_task_info *task_info)
{
   const uint16_t num_entries = get_task_num_entries(info->family);
   const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
   const uint32_t payload_ring_bytes = num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES;

   /* Ensure that the addresses of each ring are 256 byte aligned. */
   task_info->num_entries = num_entries;
   task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256);
   task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256);
   task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes;
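   /* Layout sketch with assumed sizes (purely illustrative, not the real
    * macro values): if AC_TASK_CTRLBUF_BYTES were 256 and each draw entry
    * 16 bytes, then with 1024 entries draw_ring_offset = 256 and
    * payload_ring_offset = ALIGN(256 + 16384, 256) = 16640.
    */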
}

uint32_t ac_memory_ops_per_clock(uint32_t vram_type)
{
   /* Based on MemoryOpsPerClockTable from PAL. */
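   /* The returned value scales the memory clock into an effective data rate;
    * e.g. GDDR6 at a 1250 MHz memory clock transfers 1250 * 16 = 20000 MT/s
    * per pin.
    */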
   switch (vram_type) {
   case AMDGPU_VRAM_TYPE_GDDR1:
   case AMDGPU_VRAM_TYPE_GDDR3: /* last in low-end Evergreen */
   case AMDGPU_VRAM_TYPE_GDDR4: /* last in R7xx, not used much */
   case AMDGPU_VRAM_TYPE_UNKNOWN:
   default:
      return 0;
   case AMDGPU_VRAM_TYPE_DDR2:
   case AMDGPU_VRAM_TYPE_DDR3:
   case AMDGPU_VRAM_TYPE_DDR4:
   case AMDGPU_VRAM_TYPE_LPDDR4:
   case AMDGPU_VRAM_TYPE_HBM: /* same for HBM2 and HBM3 */
      return 2;
   case AMDGPU_VRAM_TYPE_DDR5:
   case AMDGPU_VRAM_TYPE_LPDDR5:
   case AMDGPU_VRAM_TYPE_GDDR5: /* last in Polaris and low-end Navi14 */
      return 4;
   case AMDGPU_VRAM_TYPE_GDDR6:
      return 16;
   }
}