/*
 * Copyright © 2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 */
6 
7 #ifndef FREEDRENO_DEVICE_INFO_H
8 #define FREEDRENO_DEVICE_INFO_H
9 
10 #include <assert.h>
11 #include <stdbool.h>
12 #include <stdint.h>
13 
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
/**
 * Freedreno hardware description and quirks
 *
 * One fd_dev_info holds the description of a single GPU: generic tiling and
 * GMEM limits at the top level, followed by feature flags, quirks and
 * register values grouped in the a6xx and a7xx sub-structs.
 */

struct fd_dev_info {
   /* Returned as-is by fd_dev_gen(). */
   uint8_t chip;

   /* alignment for size of tiles */
   uint32_t tile_align_w, tile_align_h;
   /* gmem load/store granularity */
   uint32_t gmem_align_w, gmem_align_h;
   /* max tile size */
   uint32_t tile_max_w, tile_max_h;

   uint32_t num_vsc_pipes;

   uint32_t cs_shared_mem_size;

   int wave_granularity;

   /* These are fallback values that should match what drm/msm programs, for
    * kernels that don't support returning them. Newer devices should not set
    * them and just use the value from the kernel.
    */
   uint32_t highest_bank_bit;
   uint32_t ubwc_swizzle;
   uint32_t macrotile_mode;

   /* Information for private memory calculations */
   uint32_t fibers_per_sp;

   uint32_t threadsize_base;

   uint32_t max_waves;

   /* The number of CCU is always equal to the number of SP, so both names
    * alias the same storage.
    */
   union {
      uint32_t num_sp_cores;
      uint32_t num_ccu;
   };

   struct {
      uint32_t reg_size_vec4;

      /* The size (in instrlen units (128 bytes)) of instruction cache where
       * we preload a shader. Loading more than this could trigger a hang
       * on gen3 and later.
       */
      uint32_t instr_cache_size;

      bool has_hw_multiview;

      bool has_fs_tex_prefetch;

      /* Whether the PC_MULTIVIEW_MASK register exists. */
      bool supports_multiview_mask;

      /* info for setting RB_CCU_CNTL */
      bool concurrent_resolve;
      bool has_z24uint_s8uint;

      bool tess_use_shared;

      /* Does the hw support GL_QCOM_shading_rate? */
      bool has_legacy_pipeline_shading_rate;

      /* Whether a 16-bit descriptor can be used */
      bool storage_16bit;

      /* The latest known a630_sqe.fw fails to wait for WFI before
       * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
       * so we have to fall back to CP_WAIT_FOR_ME except for a650
       * which has a fixed firmware.
       *
       * TODO: There may be newer a630_sqe.fw released in the future
       * which fixes this, if so we should detect it and avoid this
       * workaround.  Once we have uapi to query fw version, we can
       * replace this with minimum fw version.
       */
      bool indirect_draw_wfm_quirk;

      /* On some GPUs, the depth test needs to be enabled when the
       * depth bounds test is enabled and the depth attachment uses UBWC.
       */
      bool depth_bounds_require_depth_test_quirk;

      bool has_tex_filter_cubic;

      /* The blob driver does not support SEPARATE_RECONSTRUCTION_FILTER_BIT
       * before a6xx_gen3.  It still sets CHROMA_LINEAR bit according to
       * chromaFilter, but the bit has no effect before a6xx_gen3.
       */
      bool has_separate_chroma_filter;

      bool has_sample_locations;

      /* The firmware on newer a6xx drops CP_REG_WRITE support as we
       * can now use direct register writes for these regs.
       */
      bool has_cp_reg_write;

      bool has_8bpp_ubwc;

      bool has_lpac;

      bool has_getfiberid;

      bool has_dp2acc;
      bool has_dp4acc;

      /* LRZ fast-clear works on all gens, however blob disables it on
       * gen1 and gen2. We also elect to disable fast-clear on these gens
       * because it adds complexity for close to no gain and seems to work
       * a bit differently from gen3+, which creates at least one edge case:
       * if the first draw which uses LRZ fast-clear doesn't lock LRZ
       * direction, the fast-clear value is undefined. For details see
       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
       */
      bool enable_lrz_fast_clear;
      bool has_lrz_dir_tracking;
      bool lrz_track_quirk;
      bool has_lrz_feedback;

      /* Some generations have a bit to add the multiview index to the
       * viewport index, which lets us implement different scaling for
       * different views.
       */
      bool has_per_view_viewport;
      bool has_gmem_fast_clear;

      /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
       * in sysmem rendering.
       */
      uint32_t sysmem_per_ccu_depth_cache_size;
      uint32_t sysmem_per_ccu_color_cache_size;
      /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
       * which require color cache (non-BLIT event case).
       * The size is expressed as a fraction of ccu cache used by sysmem
       * rendering. If a GMEM resolve requires color cache, the driver needs
       * to make sure it will not overwrite pixel data in GMEM that is still
       * needed.
       */
      /* see enum a6xx_ccu_cache_size */
      uint32_t gmem_ccu_color_cache_fraction;

      /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
      uint32_t prim_alloc_threshold;

      uint32_t vs_max_inputs_count;

      bool supports_double_threadsize;

      bool has_sampler_minmax;

      bool broken_ds_ubwc_quirk;

      /* See ir3_compiler::has_scalar_alu. */
      bool has_scalar_alu;
      /* See ir3_compiler::has_early_preamble. */
      bool has_early_preamble;

      bool has_isam_v;
      bool has_ssbo_imm_offsets;

      /* Whether writing to UBWC attachment and reading the same image as input
       * attachment or as a texture reads correct values from the image.
       * If this is false, we may read stale values from the flag buffer,
       * thus reading incorrect values from the image.
       * Happens with VK_EXT_attachment_feedback_loop_layout.
       */
      bool has_coherent_ubwc_flag_caches;

      bool has_attachment_shading_rate;

      /* Whether mipmaps below certain threshold can use LINEAR tiling when
       * higher levels use UBWC.
       */
      bool has_ubwc_linear_mipmap_fallback;

      /* Whether 4 nops are needed after the second pred[tf] of a
       * pred[tf]/pred[ft] pair to work around a hardware issue.
       */
      bool predtf_nop_quirk;

      /* Whether 6 nops are needed after prede to work around a hardware
       * issue.
       */
      bool prede_nop_quirk;

      /* Whether the sad instruction (iadd3) is supported. */
      bool has_sad;

      /* Per-device values for the registers named by each member. */
      struct {
         uint32_t PC_POWER_CNTL;
         uint32_t TPL1_DBG_ECO_CNTL;
         uint32_t GRAS_DBG_ECO_CNTL;
         uint32_t SP_CHICKEN_BITS;
         uint32_t UCHE_CLIENT_PF;
         uint32_t PC_MODE_CNTL;
         uint32_t SP_DBG_ECO_CNTL;
         uint32_t RB_DBG_ECO_CNTL;
         uint32_t RB_DBG_ECO_CNTL_blit;
         uint32_t HLSQ_DBG_ECO_CNTL;
         uint32_t RB_UNKNOWN_8E01;
         uint32_t VPC_DBG_ECO_CNTL;
         uint32_t UCHE_UNKNOWN_0E12;

         uint32_t RB_UNKNOWN_8E06;
      } magic;

      /* Fixed-capacity table of (register, value) pairs.
       * NOTE(review): how the used portion of the 64 slots is delimited is
       * not visible in this header — confirm against the users.
       */
      struct {
         uint32_t reg;
         uint32_t value;
      } magic_raw[64];

      /* maximum number of descriptor sets */
      uint32_t max_sets;

      float line_width_min;
      float line_width_max;
   } a6xx;

   struct {
      /* stsc may need to be done twice for the same range to workaround
       * _something_, observed in blob's disassembly.
       */
      bool stsc_duplication_quirk;

      /* Whether there is CP_EVENT_WRITE7::WRITE_SAMPLE_COUNT */
      bool has_event_write_sample_count;

      bool has_64b_ssbo_atomics;

      /* Blob executes a special compute dispatch at the start of each
       * command buffer. We copy this dispatch as is.
       */
      bool cmdbuf_start_a725_quirk;

      bool load_inline_uniforms_via_preamble_ldgk;
      bool load_shader_consts_via_preamble;

      bool has_gmem_vpc_attr_buf;
      /* Size of buffer in gmem for VPC attributes */
      uint32_t sysmem_vpc_attr_buf_size;
      uint32_t gmem_vpc_attr_buf_size;

      /* Whether UBWC is supported on all IBOs. Prior to this, only readonly
       * or writeonly IBOs could use UBWC and mixing reads and writes was not
       * permitted.
       */
      bool supports_ibo_ubwc;

      /* Whether the UBWC fast-clear values for snorm, unorm, and int formats
       * are the same. This is the case from a740 onwards. These formats were
       * already otherwise UBWC-compatible, so this means that they are now
       * fully compatible.
       */
      bool ubwc_unorm_snorm_int_compatible;

      /* Having zero consts in one FS may corrupt consts in follow up FSs,
       * on such GPUs blob never has zero consts in FS. The mechanism of
       * corruption is unknown.
       */
      bool fs_must_have_non_zero_constlen_quirk;

      /* On a750 there is a hardware bug where certain VPC sizes in a GS with
       * an input primitive type that is a triangle with adjacency can hang
       * with a high enough vertex count.
       */
      bool gs_vpc_adjacency_quirk;

      /* On a740 TPL1_DBG_ECO_CNTL1.TP_UBWC_FLAG_HINT must be the same between
       * all drivers in the system, somehow having different values affects
       * BLIT_OP_SCALE. We cannot automatically match blob's value, so the
       * best thing we could do is a toggle.
       */
      bool enable_tp_ubwc_flag_hint;

      bool storage_8bit;

      /* A750+ added a special flag that allows HW to correctly interpret
       * UBWC, including UBWC fast-clear when casting image to a different
       * format permitted by Vulkan. So it's possible to have UBWC enabled
       * for image that has e.g. R32_UINT and R8G8B8A8_UNORM in the mutable
       * formats list.
       */
      bool ubwc_all_formats_compatible;

      bool has_compliant_dp4acc;

      /* Whether a single clear blit could be used for both sysmem and gmem. */
      bool has_generic_clear;

      /* Whether r8g8 UBWC fast-clear works correctly.
       * NOTE(review): the field name suggests true means the fast-clear is
       * faulty, i.e. the opposite polarity of this sentence — confirm.
       */
      bool r8g8_faulty_fast_clear_quirk;

      /* a750 has a bug where writing and then reading a UBWC-compressed IBO
       * requires flushing UCHE. This is reproducible in many CTS tests, for
       * example dEQP-VK.image.load_store.with_format.2d.*.
       */
      bool ubwc_coherency_quirk;

      /* Whether CP_ALWAYS_ON_COUNTER only resets on device loss rather than
       * on every suspend/resume.
       */
      bool has_persistent_counter;

      /* Whether only 256 vec4 constants are available for compute */
      bool compute_constlen_quirk;

      bool has_primitive_shading_rate;

      /* A7XX gen1 and gen2 seem to require declaring SAMPLEMASK input
       * for fragment shading rate to be read correctly.
       * This workaround was seen in the prop driver v512.762.12.
       */
      bool reading_shading_rate_requires_smask_quirk;
   } a7xx;
};
335 
/* Identifies a device by gpu-id and/or chip-id; fd_dev_gpu_id() requires
 * at least one of the two to be non-zero.
 */
struct fd_dev_id {
   uint32_t gpu_id;
   uint64_t chip_id;
};

/**
 * Note that gpu-id should be considered deprecated.  For newer a6xx, if
 * there is no gpu-id, this attempts to generate one from the chip-id.
 * But that may not work forever, so avoid depending on this for newer
 * gens.
 *
 * Returns id->gpu_id when it is non-zero.  Otherwise combines three bytes
 * of chip_id as bits[31:24] * 100 + bits[23:16] * 10 + bits[15:8]; the
 * lowest byte is ignored.
 *
 * Asserts that gpu_id and chip_id are not both zero.
 */
static inline uint32_t
fd_dev_gpu_id(const struct fd_dev_id *id)
{
   assert(id->gpu_id || id->chip_id);
   if (!id->gpu_id) {
      /* The sum is at most 255 * 111, so narrowing from the 64-bit
       * chip_id arithmetic to uint32_t cannot lose bits.
       */
      return (uint32_t)(((id->chip_id >> 24) & 0xff) * 100 +
                        ((id->chip_id >> 16) & 0xff) * 10 +
                        ((id->chip_id >>  8) & 0xff));
   }
   return id->gpu_id;
}
359 
/* Unmodified dev info as defined in freedreno_devices.py */
const struct fd_dev_info *fd_dev_info_raw(const struct fd_dev_id *id);

/* Final dev info with dbg options and everything else applied.  The result
 * is returned by value, so the caller receives its own copy.
 */
const struct fd_dev_info fd_dev_info(const struct fd_dev_id *id);

/* Takes a name string instead of a device id and returns the same pointer
 * type as fd_dev_info_raw().
 * NOTE(review): the result for an unknown name is not visible in this
 * header — confirm before dereferencing unchecked.
 */
const struct fd_dev_info *fd_dev_info_raw_by_name(const char *name);
367 
368 static uint8_t
fd_dev_gen(const struct fd_dev_id * id)369 fd_dev_gen(const struct fd_dev_id *id)
370 {
371    return fd_dev_info_raw(id)->chip;
372 }
373 
/* True when fd_dev_gen() reports 5 or higher for id.
 * NOTE(review): the name suggests this means 64-bit GPU addressing —
 * confirm against the callers.
 */
static inline bool
fd_dev_64b(const struct fd_dev_id *id)
{
   return fd_dev_gen(id) >= 5;
}
379 
/* per CCU GMEM amount reserved for depth cache for direct rendering */
#define A6XX_CCU_DEPTH_SIZE (64 * 1024)
/* per CCU GMEM amount reserved for color cache used by GMEM resolves
 * which require color cache (non-BLIT event case).
 * This is smaller than what is normally used by direct rendering
 * (RB_CCU_CNTL.GMEM bit enables this smaller size).
 * If a GMEM resolve requires color cache, the driver needs to make sure
 * it will not overwrite pixel data in GMEM that is still needed.
 */
#define A6XX_CCU_GMEM_COLOR_SIZE (16 * 1024)
390 
/* Returns a name string for the device identified by id.
 * NOTE(review): ownership of the returned string and the result for an
 * unknown id are not visible in this header — confirm.
 */
const char *fd_dev_name(const struct fd_dev_id *id);

/* Takes a mutable dev info, presumably updated in place with the debug
 * options (see the comment on fd_dev_info()) — confirm in the definition.
 */
void
fd_dev_info_apply_dbg_options(struct fd_dev_info *info);
395 
396 #ifdef __cplusplus
397 } /* end of extern "C" */
398 #endif
399 
400 #endif /* FREEDRENO_DEVICE_INFO_H */
401