/*
 * Copyright © 2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 */
6
7 #ifndef FREEDRENO_DEVICE_INFO_H
8 #define FREEDRENO_DEVICE_INFO_H
9
10 #include <assert.h>
11 #include <stdbool.h>
12 #include <stdint.h>
13
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17
/**
 * Freedreno hardware description and quirks
 *
 * Layout: a set of top-level fields followed by two nested groups named
 * a6xx and a7xx.
 */

struct fd_dev_info {
   /* GPU generation; this is the value fd_dev_gen() returns. */
   uint8_t chip;

   /* alignment for size of tiles */
   uint32_t tile_align_w, tile_align_h;
   /* gmem load/store granularity */
   uint32_t gmem_align_w, gmem_align_h;
   /* max tile size */
   uint32_t tile_max_w, tile_max_h;

   uint32_t num_vsc_pipes;

   uint32_t cs_shared_mem_size;

   int wave_granularity;

   /* These are fallback values that should match what drm/msm programs, for
    * kernels that don't support returning them. Newer devices should not set
    * them and just use the value from the kernel.
    */
   uint32_t highest_bank_bit;
   uint32_t ubwc_swizzle;
   uint32_t macrotile_mode;

   /* Information for private memory calculations */
   uint32_t fibers_per_sp;

   uint32_t threadsize_base;

   uint32_t max_waves;

   /* number of CCU is always equal to the number of SP, so both names
    * alias the same storage
    */
   union {
      uint32_t num_sp_cores;
      uint32_t num_ccu;
   };

   struct {
      uint32_t reg_size_vec4;

      /* The size (in instrlen units (128 bytes)) of instruction cache where
       * we preload a shader. Loading more than this could trigger a hang
       * on gen3 and later.
       */
      uint32_t instr_cache_size;

      bool has_hw_multiview;

      bool has_fs_tex_prefetch;

      /* Whether the PC_MULTIVIEW_MASK register exists. */
      bool supports_multiview_mask;

      /* info for setting RB_CCU_CNTL */
      bool concurrent_resolve;
      bool has_z24uint_s8uint;

      bool tess_use_shared;

      /* Does the hw support GL_QCOM_shading_rate? */
      bool has_legacy_pipeline_shading_rate;

      /* Whether a 16-bit descriptor can be used */
      bool storage_16bit;

      /* The latest known a630_sqe.fw fails to wait for WFI before
       * reading the indirect buffer when using CP_DRAW_INDIRECT_MULTI,
       * so we have to fall back to CP_WAIT_FOR_ME except for a650
       * which has a fixed firmware.
       *
       * TODO: There may be newer a630_sqe.fw released in the future
       * which fixes this, if so we should detect it and avoid this
       * workaround. Once we have uapi to query fw version, we can
       * replace this with minimum fw version.
       */
      bool indirect_draw_wfm_quirk;

      /* On some GPUs, the depth test needs to be enabled when the
       * depth bounds test is enabled and the depth attachment uses UBWC.
       */
      bool depth_bounds_require_depth_test_quirk;

      bool has_tex_filter_cubic;

      /* The blob driver does not support SEPARATE_RECONSTRUCTION_FILTER_BIT
       * before a6xx_gen3. It still sets CHROMA_LINEAR bit according to
       * chromaFilter, but the bit has no effect before a6xx_gen3.
       */
      bool has_separate_chroma_filter;

      bool has_sample_locations;

      /* The firmware on newer a6xx drops CP_REG_WRITE support as we
       * can now use direct register writes for these regs.
       */
      bool has_cp_reg_write;

      bool has_8bpp_ubwc;

      bool has_lpac;

      bool has_getfiberid;

      bool has_dp2acc;
      bool has_dp4acc;

      /* LRZ fast-clear works on all gens, however blob disables it on
       * gen1 and gen2. We also elect to disable fast-clear on these gens
       * because for close to no gain it adds complexity and seems to work
       * a bit differently from gen3+, which creates at least one edge case:
       * if the first draw which uses LRZ fast-clear doesn't lock LRZ
       * direction, the fast-clear value is undefined. For details see
       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
       */
      bool enable_lrz_fast_clear;
      bool has_lrz_dir_tracking;
      bool lrz_track_quirk;
      bool has_lrz_feedback;

      /* Some generations have a bit to add the multiview index to the
       * viewport index, which lets us implement different scaling for
       * different views.
       */
      bool has_per_view_viewport;
      bool has_gmem_fast_clear;

      /* Per CCU GMEM amount reserved for each of DEPTH and COLOR caches
       * in sysmem rendering.
       */
      uint32_t sysmem_per_ccu_depth_cache_size;
      uint32_t sysmem_per_ccu_color_cache_size;
      /* Per CCU GMEM amount reserved for color cache used by GMEM resolves
       * which require color cache (non-BLIT event case).
       * The size is expressed as a fraction of ccu cache used by sysmem
       * rendering. If a GMEM resolve requires color cache, the driver needs
       * to make sure it will not overwrite pixel data in GMEM that is still
       * needed.
       */
      /* see enum a6xx_ccu_cache_size */
      uint32_t gmem_ccu_color_cache_fraction;

      /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD */
      uint32_t prim_alloc_threshold;

      uint32_t vs_max_inputs_count;

      bool supports_double_threadsize;

      bool has_sampler_minmax;

      bool broken_ds_ubwc_quirk;

      /* See ir3_compiler::has_scalar_alu. */
      bool has_scalar_alu;
      /* See ir3_compiler::has_early_preamble. */
      bool has_early_preamble;

      bool has_isam_v;
      bool has_ssbo_imm_offsets;

      /* Whether writing to UBWC attachment and reading the same image as input
       * attachment or as a texture reads correct values from the image.
       * If this is false, we may read stale values from the flag buffer,
       * thus reading incorrect values from the image.
       * Happens with VK_EXT_attachment_feedback_loop_layout.
       */
      bool has_coherent_ubwc_flag_caches;

      bool has_attachment_shading_rate;

      /* Whether mipmaps below certain threshold can use LINEAR tiling when
       * higher levels use UBWC.
       */
      bool has_ubwc_linear_mipmap_fallback;

      /* Whether 4 nops are needed after the second pred[tf] of a
       * pred[tf]/pred[ft] pair to work around a hardware issue.
       */
      bool predtf_nop_quirk;

      /* Whether 6 nops are needed after prede to work around a hardware
       * issue.
       */
      bool prede_nop_quirk;

      /* Whether the sad instruction (iadd3) is supported. */
      bool has_sad;

      /* One 32-bit value per named register.
       * NOTE(review): presumably the per-device value programmed into each
       * register -- confirm against the users of this struct.
       */
      struct {
         uint32_t PC_POWER_CNTL;
         uint32_t TPL1_DBG_ECO_CNTL;
         uint32_t GRAS_DBG_ECO_CNTL;
         uint32_t SP_CHICKEN_BITS;
         uint32_t UCHE_CLIENT_PF;
         uint32_t PC_MODE_CNTL;
         uint32_t SP_DBG_ECO_CNTL;
         uint32_t RB_DBG_ECO_CNTL;
         uint32_t RB_DBG_ECO_CNTL_blit;
         uint32_t HLSQ_DBG_ECO_CNTL;
         uint32_t RB_UNKNOWN_8E01;
         uint32_t VPC_DBG_ECO_CNTL;
         uint32_t UCHE_UNKNOWN_0E12;

         uint32_t RB_UNKNOWN_8E06;
      } magic;

      /* Fixed-capacity table of up to 64 (reg, value) pairs.
       * NOTE(review): how the used length is marked (e.g. a zero
       * terminator) is not visible here -- confirm before iterating.
       */
      struct {
         uint32_t reg;
         uint32_t value;
      } magic_raw[64];

      /* maximum number of descriptor sets */
      uint32_t max_sets;

      float line_width_min;
      float line_width_max;
   } a6xx;

   struct {
      /* stsc may need to be done twice for the same range to workaround
       * _something_, observed in blob's disassembly.
       */
      bool stsc_duplication_quirk;

      /* Whether there is CP_EVENT_WRITE7::WRITE_SAMPLE_COUNT */
      bool has_event_write_sample_count;

      bool has_64b_ssbo_atomics;

      /* Blob executes a special compute dispatch at the start of each
       * command buffer. We copy this dispatch as is.
       */
      bool cmdbuf_start_a725_quirk;

      bool load_inline_uniforms_via_preamble_ldgk;
      bool load_shader_consts_via_preamble;

      bool has_gmem_vpc_attr_buf;
      /* Size of buffer in gmem for VPC attributes */
      uint32_t sysmem_vpc_attr_buf_size;
      uint32_t gmem_vpc_attr_buf_size;

      /* Whether UBWC is supported on all IBOs. Prior to this, only readonly
       * or writeonly IBOs could use UBWC and mixing reads and writes was not
       * permitted.
       */
      bool supports_ibo_ubwc;

      /* Whether the UBWC fast-clear values for snorm, unorm, and int formats
       * are the same. This is the case from a740 onwards. These formats were
       * already otherwise UBWC-compatible, so this means that they are now
       * fully compatible.
       */
      bool ubwc_unorm_snorm_int_compatible;

      /* Having zero consts in one FS may corrupt consts in follow up FSs,
       * on such GPUs blob never has zero consts in FS. The mechanism of
       * corruption is unknown.
       */
      bool fs_must_have_non_zero_constlen_quirk;

      /* On a750 there is a hardware bug where certain VPC sizes in a GS with
       * an input primitive type that is a triangle with adjacency can hang
       * with a high enough vertex count.
       */
      bool gs_vpc_adjacency_quirk;

      /* On a740 TPL1_DBG_ECO_CNTL1.TP_UBWC_FLAG_HINT must be the same between
       * all drivers in the system, somehow having different values affects
       * BLIT_OP_SCALE. We cannot automatically match blob's value, so the
       * best thing we could do is a toggle.
       */
      bool enable_tp_ubwc_flag_hint;

      bool storage_8bit;

      /* A750+ added a special flag that allows HW to correctly interpret
       * UBWC, including UBWC fast-clear when casting image to a different
       * format permitted by Vulkan. So it's possible to have UBWC enabled
       * for image that has e.g. R32_UINT and R8G8B8A8_UNORM in the mutable
       * formats list.
       */
      bool ubwc_all_formats_compatible;

      bool has_compliant_dp4acc;

      /* Whether a single clear blit could be used for both sysmem and gmem. */
      bool has_generic_clear;

      /* Whether r8g8 UBWC fast-clear works correctly.
       * NOTE(review): the field is named as a "faulty ... quirk", so true
       * presumably means fast-clear is broken -- confirm the polarity.
       */
      bool r8g8_faulty_fast_clear_quirk;

      /* a750 has a bug where writing and then reading a UBWC-compressed IBO
       * requires flushing UCHE. This is reproducible in many CTS tests, for
       * example dEQP-VK.image.load_store.with_format.2d.*.
       */
      bool ubwc_coherency_quirk;

      /* Whether CP_ALWAYS_ON_COUNTER only resets on device loss rather than
       * on every suspend/resume.
       */
      bool has_persistent_counter;

      /* Whether only 256 vec4 constants are available for compute */
      bool compute_constlen_quirk;

      bool has_primitive_shading_rate;

      /* A7XX gen1 and gen2 seem to require declaring SAMPLEMASK input
       * for fragment shading rate to be read correctly.
       * This workaround was seen in the prop driver v512.762.12.
       */
      bool reading_shading_rate_requires_smask_quirk;
   } a7xx;
};
335
/* Identifies a device by its gpu-id and/or its chip-id. */
struct fd_dev_id {
   uint32_t gpu_id;
   uint64_t chip_id;
};

/**
 * Return the gpu-id for a device.
 *
 * Note that gpu-id should be considered deprecated.  For newer a6xx, if
 * there is no gpu-id, this attempts to generate one from the chip-id.
 * But that may not work forever, so avoid depending on this for newer
 * gens.
 *
 * At least one of id->gpu_id and id->chip_id must be non-zero; this is
 * asserted.  A non-zero gpu_id is returned unchanged.  Otherwise the
 * result is built from chip_id bits 31:24, 23:16 and 15:8, weighted by
 * 100, 10 and 1 respectively; all other chip_id bits are ignored.
 */
static inline uint32_t
fd_dev_gpu_id(const struct fd_dev_id *id)
{
   assert(id->gpu_id || id->chip_id);

   if (id->gpu_id)
      return id->gpu_id;

   const uint64_t hundreds = (id->chip_id >> 24) & 0xff;
   const uint64_t tens = (id->chip_id >> 16) & 0xff;
   const uint64_t ones = (id->chip_id >> 8) & 0xff;

   /* Each term is at most 255 * 100, so the sum always fits in 32 bits. */
   return (uint32_t)(hundreds * 100 + tens * 10 + ones);
}
359
/* Unmodified dev info as defined in freedreno_devices.py */
const struct fd_dev_info *fd_dev_info_raw(const struct fd_dev_id *id);

/* Final dev info with dbg options and everything else applied.
 * Returned by value, unlike the raw lookups which return a pointer.
 */
const struct fd_dev_info fd_dev_info(const struct fd_dev_id *id);

/* NOTE(review): presumably the same unmodified dev info as
 * fd_dev_info_raw(), looked up by device name instead of by id -- the
 * behaviour for an unknown name is not visible here; confirm in the
 * implementation.
 */
const struct fd_dev_info *fd_dev_info_raw_by_name(const char *name);
367
368 static uint8_t
fd_dev_gen(const struct fd_dev_id * id)369 fd_dev_gen(const struct fd_dev_id *id)
370 {
371 return fd_dev_info_raw(id)->chip;
372 }
373
/**
 * Whether the device's generation, as reported by fd_dev_gen(), is 5 or
 * later.
 *
 * NOTE(review): going by the name, this presumably marks generations that
 * use 64-bit addressing -- confirm with the callers.
 */
static inline bool
fd_dev_64b(const struct fd_dev_id *id)
{
   return fd_dev_gen(id) >= 5;
}
379
/* Per CCU GMEM amount (64 KiB) reserved for depth cache for direct
 * rendering.
 */
#define A6XX_CCU_DEPTH_SIZE (64 * 1024)
/* Per CCU GMEM amount (16 KiB) reserved for color cache used by GMEM
 * resolves which require color cache (non-BLIT event case).
 * This is smaller than what is normally used by direct rendering
 * (RB_CCU_CNTL.GMEM bit enables this smaller size).
 * If a GMEM resolve requires color cache, the driver needs to make sure
 * it will not overwrite pixel data in GMEM that is still needed.
 */
#define A6XX_CCU_GMEM_COLOR_SIZE (16 * 1024)
390
/* NOTE(review): presumably returns a human-readable name for the device
 * identified by id; who owns the returned string and what is returned for
 * an unknown id are not visible here -- confirm in the implementation.
 */
const char *fd_dev_name(const struct fd_dev_id *id);

/* NOTE(review): takes a mutable info and returns nothing, so it presumably
 * applies the debug options to *info in place -- confirm in the
 * implementation.
 */
void
fd_dev_info_apply_dbg_options(struct fd_dev_info *info);
395
396 #ifdef __cplusplus
397 } /* end of extern "C" */
398 #endif
399
400 #endif /* FREEDRENO_DEVICE_INFO_H */
401