1 /*
2 * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27 #ifndef FREEDRENO_UTIL_H_
28 #define FREEDRENO_UTIL_H_
29
30 #include "common/freedreno_common.h"
31
32 #include "drm/freedreno_drmif.h"
33 #include "drm/freedreno_ringbuffer.h"
34
35 #include "util/format/u_formats.h"
36 #include "pipe/p_state.h"
37 #include "util/compiler.h"
38 #include "util/half_float.h"
39 #include "util/log.h"
40 #ifndef __cplusplus // TODO fix cpu_trace.h to be c++ friendly
41 #include "util/perf/cpu_trace.h"
42 #endif
43 #include "util/u_debug.h"
44 #include "util/u_dynarray.h"
45 #include "util/u_math.h"
46 #include "util/u_pack_color.h"
47
48 #include "adreno_common.xml.h"
49 #include "adreno_pm4.xml.h"
50 #include "disasm.h"
51
52 #ifdef __cplusplus
53 extern "C" {
54 #endif
55
/* Translation helpers mapping gallium pipe enums to adreno hw encodings
 * (implementations live elsewhere in the driver):
 */
enum adreno_rb_depth_format fd_pipe2depth(enum pipe_format format);
enum pc_di_index_size fd_pipe2index(enum pipe_format format);
enum pipe_format fd_gmem_restore_format(enum pipe_format format);
enum adreno_rb_blend_factor fd_blend_factor(unsigned factor);
enum adreno_pa_su_sc_draw fd_polygon_mode(unsigned mode);
enum adreno_stencil_op fd_stencil_op(unsigned op);
62
#define A3XX_MAX_MIP_LEVELS 14

/* Max color render targets (MRTs) per hw generation: */
#define A2XX_MAX_RENDER_TARGETS 1
#define A3XX_MAX_RENDER_TARGETS 4
#define A4XX_MAX_RENDER_TARGETS 8
#define A5XX_MAX_RENDER_TARGETS 8
#define A6XX_MAX_RENDER_TARGETS 8

/* Worst case across all supported generations: */
#define MAX_RENDER_TARGETS A6XX_MAX_RENDER_TARGETS
72
/* clang-format off */
/* Debug flags for the fd_mesa_debug bitmask, tested via the FD_DBG()
 * macro below.  Each bit enables one debug behavior in the driver;
 * presumably set from an environment debug-option list — TODO confirm
 * where the option-name table lives.
 */
enum fd_debug_flag {
   FD_DBG_MSGS         = BITFIELD_BIT(0),
   FD_DBG_DISASM       = BITFIELD_BIT(1),
   FD_DBG_DCLEAR       = BITFIELD_BIT(2),
   FD_DBG_DDRAW        = BITFIELD_BIT(3),
   FD_DBG_NOSCIS       = BITFIELD_BIT(4),
   FD_DBG_DIRECT       = BITFIELD_BIT(5),
   FD_DBG_GMEM         = BITFIELD_BIT(6),
   FD_DBG_PERF         = BITFIELD_BIT(7),
   FD_DBG_NOBIN        = BITFIELD_BIT(8),
   FD_DBG_SYSMEM       = BITFIELD_BIT(9),
   FD_DBG_SERIALC      = BITFIELD_BIT(10),
   FD_DBG_SHADERDB     = BITFIELD_BIT(11),
   FD_DBG_FLUSH        = BITFIELD_BIT(12),
   FD_DBG_DEQP         = BITFIELD_BIT(13),
   FD_DBG_INORDER      = BITFIELD_BIT(14),
   FD_DBG_BSTAT        = BITFIELD_BIT(15),
   FD_DBG_NOGROW       = BITFIELD_BIT(16),
   FD_DBG_LRZ          = BITFIELD_BIT(17),
   FD_DBG_NOINDR       = BITFIELD_BIT(18),
   FD_DBG_NOBLIT       = BITFIELD_BIT(19),
   FD_DBG_HIPRIO       = BITFIELD_BIT(20),
   FD_DBG_TTILE        = BITFIELD_BIT(21),
   FD_DBG_PERFC        = BITFIELD_BIT(22),
   FD_DBG_NOUBWC       = BITFIELD_BIT(23),
   FD_DBG_NOLRZ        = BITFIELD_BIT(24),
   FD_DBG_NOTILE       = BITFIELD_BIT(25),
   FD_DBG_LAYOUT       = BITFIELD_BIT(26),
   FD_DBG_NOFP16       = BITFIELD_BIT(27),
   FD_DBG_NOHW         = BITFIELD_BIT(28),
   FD_DBG_NOSBIN       = BITFIELD_BIT(29),
};
/* clang-format on */
107
/* Driver-wide debug bitmask and binning enable (defined in the driver;
 * presumably initialized from environment options — TODO confirm):
 */
extern int fd_mesa_debug;
extern bool fd_binning_enabled;

/* Test one fd_debug_flag category, e.g. FD_DBG(PERF): */
#define FD_DBG(category) unlikely(fd_mesa_debug &FD_DBG_##category)
112
113 #include <unistd.h>
114 #include <sys/types.h>
115 #include <sys/syscall.h>
116
/* Debug logging: emitted only when the MSGS debug flag is set; each
 * message is tagged with the thread id (gettid via raw syscall),
 * function name and line number.
 */
#define DBG(fmt, ...)                                                          \
   do {                                                                        \
      if (FD_DBG(MSGS))                                                        \
         mesa_logi("%5d: %s:%d: " fmt, ((pid_t)syscall(SYS_gettid)),           \
                   __func__, __LINE__,                                         \
                   ##__VA_ARGS__);                                             \
   } while (0)
124
/* Emit a performance warning: logged when the PERF debug flag is set,
 * and additionally forwarded to the gallium debug callback (if one is
 * installed) so tools/apps can capture it.
 */
#define perf_debug_message(debug, type, ...)                                   \
   do {                                                                        \
      if (FD_DBG(PERF))                                                        \
         mesa_logw(__VA_ARGS__);                                               \
      struct util_debug_callback *__d = (debug);                               \
      if (__d)                                                                 \
         util_debug_message(__d, type, __VA_ARGS__);                           \
   } while (0)

/* Convenience wrapper that pulls the debug callback out of the context
 * (ctx may be NULL):
 */
#define perf_debug_ctx(ctx, ...)                                               \
   do {                                                                        \
      struct fd_context *__c = (ctx);                                          \
      perf_debug_message(__c ? &__c->debug : NULL, PERF_INFO, __VA_ARGS__);    \
   } while (0)

/* Perf warning without a context (log-only path): */
#define perf_debug(...) perf_debug_ctx(NULL, __VA_ARGS__)
141
/* Times the statement (or block) that follows the macro invocation; if
 * it takes longer than limit_ns, emits a perf warning including the
 * elapsed time in ms.  Implemented as a degenerate single-iteration
 * for-loop so it can prefix a statement; the increment part (a GCC
 * statement expression) runs once after the body.  Timing reads as 0
 * unless perf logging is active — see __perf_get_time().
 */
#define perf_time_ctx(ctx, limit_ns, fmt, ...)                                 \
   for (struct __perf_time_state __s =                                         \
           {                                                                   \
              .t = -__perf_get_time(ctx),                                      \
           };                                                                  \
        !__s.done; ({                                                          \
           __s.t += __perf_get_time(ctx);                                      \
           __s.done = true;                                                    \
           if (__s.t > (limit_ns)) {                                           \
              perf_debug_ctx(ctx, fmt " (%.03f ms)", ##__VA_ARGS__,            \
                             (double)__s.t / 1000000.0);                       \
           }                                                                   \
        }))

/* perf_time_ctx() without a context: */
#define perf_time(limit_ns, fmt, ...)                                          \
   perf_time_ctx(NULL, limit_ns, fmt, ##__VA_ARGS__)

/* Loop state for perf_time_ctx(): accumulated elapsed ns and the
 * single-iteration termination flag.
 */
struct __perf_time_state {
   int64_t t;
   bool done;
};
163
/* static inline would be nice here, except 'struct fd_context' is not
 * defined yet:
 *
 * Returns the current time in ns when perf logging is active (either
 * the PERF debug flag is set, or the context has a debug-message
 * callback installed), otherwise 0 — making perf_time_ctx() nearly
 * free in the common case.
 */
#define __perf_get_time(ctx)                                                   \
   ((FD_DBG(PERF) || ({                                                        \
       struct fd_context *__c = (ctx);                                         \
       unlikely(__c && __c->debug.debug_message);                              \
    }))                                                                        \
       ? os_time_get_nano()                                                    \
       : 0)
174
/* Declares a downcast helper: child(parent *) casts a parent pointer
 * to the derived type.  NOTE(review): this is only sound when the
 * parent struct is embedded at offset 0 of the child — confirm at each
 * DEFINE_CAST() site.
 */
#define DEFINE_CAST(parent, child)                                             \
   static inline struct child *child(struct parent *x)                         \
   {                                                                           \
      return (struct child *)x;                                                \
   }
180
struct fd_context;

/**
 * A pseudo-variable for defining where various parts of the fd_context
 * can be safely accessed.
 *
 * With threaded_context, certain pctx funcs are called from gallium
 * front-end/state-tracker (eg. CSO creation), while others are called
 * from the driver thread.  Things called from driver thread can safely
 * access anything in the ctx, while things called from the fe/st thread
 * must limit themselves to "safe" things (ie. ctx->screen is safe as it
 * is immutable, but the blitter_context is not).
 */
extern lock_cap_t fd_context_access_cap;

/**
 * Make the annotation a bit less verbose.. mark fields which should only
 * be accessed by driver-thread with 'dt'
 */
#define dt guarded_by(fd_context_access_cap)

/**
 * Annotation for entry-point functions only called in driver thread.
 *
 * For static functions, apply the annotation to the function declaration.
 * Otherwise apply to the function prototype.
 */
#define in_dt assert_cap(fd_context_access_cap)

/**
 * Annotation for internal functions which are only called from entry-
 * point functions (with 'in_dt' annotation) or other internal functions
 * with the 'assert_dt' annotation.
 *
 * For static functions, apply the annotation to the function declaration.
 * Otherwise apply to the function prototype.
 */
#define assert_dt requires_cap(fd_context_access_cap)
219
/**
 * Special helpers for context access outside of driver thread.  For ex,
 * pctx->get_query_result() is not called on driver thread, but the
 * query is guaranteed to be flushed, or the driver thread queue is
 * guaranteed to be flushed.
 *
 * Use with caution!
 *
 * These are no-ops at runtime (empty bodies); they exist purely for
 * static thread-safety-annotation analysis.
 */
static inline void
fd_context_access_begin(struct fd_context *ctx)
   acquire_cap(fd_context_access_cap)
{
}

static inline void
fd_context_access_end(struct fd_context *ctx) release_cap(fd_context_access_cap)
{
}
238
/* Packs a register offset (relative to 0x2000) with opcode 0x4 in bits
 * 16+.  NOTE(review): exact packet semantics come from the adreno PM4
 * packet definitions — confirm against adreno_pm4.xml.h.
 */
#define CP_REG(reg) ((0x4 << 16) | ((unsigned int)((reg) - (0x2000))))
240
241 static inline uint32_t
DRAW(enum pc_di_primtype prim_type,enum pc_di_src_sel source_select,enum pc_di_index_size index_size,enum pc_di_vis_cull_mode vis_cull_mode,uint8_t instances)242 DRAW(enum pc_di_primtype prim_type, enum pc_di_src_sel source_select,
243 enum pc_di_index_size index_size, enum pc_di_vis_cull_mode vis_cull_mode,
244 uint8_t instances)
245 {
246 return (prim_type << 0) | (source_select << 6) | ((index_size & 1) << 11) |
247 ((index_size >> 1) << 13) | (vis_cull_mode << 9) | (1 << 14) |
248 (instances << 24);
249 }
250
251 static inline uint32_t
DRAW_A20X(enum pc_di_primtype prim_type,enum pc_di_face_cull_sel faceness_cull_select,enum pc_di_src_sel source_select,enum pc_di_index_size index_size,bool pre_fetch_cull_enable,bool grp_cull_enable,uint16_t count)252 DRAW_A20X(enum pc_di_primtype prim_type,
253 enum pc_di_face_cull_sel faceness_cull_select,
254 enum pc_di_src_sel source_select, enum pc_di_index_size index_size,
255 bool pre_fetch_cull_enable, bool grp_cull_enable, uint16_t count)
256 {
257 return (prim_type << 0) | (source_select << 6) |
258 (faceness_cull_select << 8) | ((index_size & 1) << 11) |
259 ((index_size >> 1) << 13) | (pre_fetch_cull_enable << 14) |
260 (grp_cull_enable << 15) | (count << 16);
261 }
262
/* for tracking cmdstream positions that need to be patched: */
struct fd_cs_patch {
   uint32_t *cs;  /* location in the cmdstream to patch */
   uint32_t val;  /* value recorded for the patch — TODO confirm how it
                   * is combined with the final value at patch time */
};
/* Number of patch entries in a util_dynarray of fd_cs_patch: */
#define fd_patch_num_elements(buf) ((buf)->size / sizeof(struct fd_cs_patch))
/* The i'th patch entry: */
#define fd_patch_element(buf, i)                                               \
   util_dynarray_element(buf, struct fd_cs_patch, i)
271
272 static inline enum pipe_format
pipe_surface_format(struct pipe_surface * psurf)273 pipe_surface_format(struct pipe_surface *psurf)
274 {
275 if (!psurf)
276 return PIPE_FORMAT_NONE;
277 return psurf->format;
278 }
279
280 static inline bool
fd_surface_half_precision(const struct pipe_surface * psurf)281 fd_surface_half_precision(const struct pipe_surface *psurf)
282 {
283 enum pipe_format format;
284
285 if (!psurf)
286 return true;
287
288 format = psurf->format;
289
290 /* colors are provided in consts, which go through cov.f32f16, which will
291 * break these values
292 */
293 if (util_format_is_pure_integer(format))
294 return false;
295
296 /* avoid losing precision on 32-bit float formats */
297 if (util_format_is_float(format) &&
298 util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) ==
299 32)
300 return false;
301
302 return true;
303 }
304
305 static inline unsigned
fd_sampler_first_level(const struct pipe_sampler_view * view)306 fd_sampler_first_level(const struct pipe_sampler_view *view)
307 {
308 if (view->target == PIPE_BUFFER)
309 return 0;
310 return view->u.tex.first_level;
311 }
312
313 static inline unsigned
fd_sampler_last_level(const struct pipe_sampler_view * view)314 fd_sampler_last_level(const struct pipe_sampler_view *view)
315 {
316 if (view->target == PIPE_BUFFER)
317 return 0;
318 return view->u.tex.last_level;
319 }
320
321 static inline bool
fd_half_precision(struct pipe_framebuffer_state * pfb)322 fd_half_precision(struct pipe_framebuffer_state *pfb)
323 {
324 unsigned i;
325
326 for (i = 0; i < pfb->nr_cbufs; i++)
327 if (!fd_surface_half_precision(pfb->cbufs[i]))
328 return false;
329
330 return true;
331 }
332
/* Forward declaration: emit_marker() is used by __OUT_IB() below but
 * defined later in this header.
 */
static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx);

/* like OUT_RING() but appends a cmdstream patch point to 'buf' */
static inline void
OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data, struct util_dynarray *buf)
{
   if (LOG_DWORDS) {
      DBG("ring[%p]: OUT_RINGP %04x: %08x", ring,
          (uint32_t)(ring->cur - ring->start), data);
   }
   /* Reserve the dword now (ring->cur++) and remember where it lives so
    * the real value can be patched in later:
    */
   util_dynarray_append(buf, struct fd_cs_patch,
                        ((struct fd_cs_patch){
                           .cs = ring->cur++,
                           .val = data,
                        }));
}
349
/* Emit indirect-buffer (IB) packets referencing each cmd in 'target',
 * with 'prefetch' selecting the PFE vs PFD packet variant.
 */
static inline void
__OUT_IB(struct fd_ringbuffer *ring, bool prefetch,
         struct fd_ringbuffer *target)
{
   /* Nothing to emit for an empty target rb: */
   if (target->cur == target->start)
      return;

   unsigned count = fd_ringbuffer_cmd_count(target);

   /* for debug after a lock up, write a unique counter value
    * to scratch6 for each IB, to make it easier to match up
    * register dumps to cmdstream.  The combination of IB and
    * DRAW (scratch7) is enough to "triangulate" the particular
    * draw that caused lockup.
    */
   emit_marker(ring, 6);

   for (unsigned i = 0; i < count; i++) {
      uint32_t dwords;
      OUT_PKT3(ring, prefetch ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD,
               2);
      /* the reloc emits the IB address; its return is size in bytes,
       * hence /4 to get the dword count:
       */
      dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4;
      assert(dwords > 0);
      OUT_RING(ring, dwords);
      /* NOTE(review): trailing type-2 packet after each IB — presumably
       * a NOP pad; confirm against the adreno packet docs.
       */
      OUT_PKT2(ring);
   }

   emit_marker(ring, 6);
}
379
/* a5xx+ variant of __OUT_IB(): emits one CP_INDIRECT_BUFFER packet per
 * cmd in 'target' (3 dwords: address reloc + size).
 */
static inline void
__OUT_IB5(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
{
   /* Nothing to emit for an empty target rb: */
   if (target->cur == target->start)
      return;

   unsigned count = fd_ringbuffer_cmd_count(target);

   for (unsigned i = 0; i < count; i++) {
      uint32_t dwords;
      OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3);
      /* reloc emits the IB address; return is size in bytes, /4 for dwords: */
      dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4;
      assert(dwords > 0);
      OUT_RING(ring, dwords);
   }
}
396
/* CP_SCRATCH_REG4 is used to hold base address for query results:
 * Note the scratch register move on a5xx+ but this is only used
 * for pre-a5xx hw queries where we cannot allocate the query buf
 * until the # of tiles is known.
 */
#define HW_QUERY_BASE_REG REG_AXXX_CP_SCRATCH_REG4

/* Scratch-reg debug markers are compiled in only for debug builds: */
#ifdef DEBUG
#define __EMIT_MARKER 1
#else
#define __EMIT_MARKER 0
#endif
409
/* Write an incrementing counter to the given CP scratch register (debug
 * builds only), preceded by a wait-for-idle so the value reflects work
 * actually completed.  Used to correlate register dumps with the
 * cmdstream after a GPU lockup — see __OUT_IB().
 */
static inline void
emit_marker(struct fd_ringbuffer *ring, int scratch_idx)
{
   extern int32_t marker_cnt;
   unsigned reg = REG_AXXX_CP_SCRATCH_REG0 + scratch_idx;
   /* must not clobber the scratch reg reserved for hw query results: */
   assert(reg != HW_QUERY_BASE_REG);
   if (reg == HW_QUERY_BASE_REG)
      return;
   if (__EMIT_MARKER) {
      OUT_WFI(ring);
      OUT_PKT0(ring, reg, 1);
      OUT_RING(ring, p_atomic_inc_return(&marker_cnt));
   }
}
424
425
426 /*
427 * a3xx+ helpers:
428 */
429
430 static inline enum a3xx_msaa_samples
fd_msaa_samples(unsigned samples)431 fd_msaa_samples(unsigned samples)
432 {
433 switch (samples) {
434 default:
435 unreachable("Unsupported samples");
436 case 0:
437 case 1:
438 return MSAA_ONE;
439 case 2:
440 return MSAA_TWO;
441 case 4:
442 return MSAA_FOUR;
443 case 8:
444 return MSAA_EIGHT;
445 }
446 }
447
/* Max texel-buffer size limits, in texels: */
#define A3XX_MAX_TEXEL_BUFFER_ELEMENTS_UINT (1 << 13)

/* Note that the Vulkan blob on a540 and 640 report a
 * maxTexelBufferElements of just 65536 (the GLES3.2 and Vulkan
 * minimum).
 */
#define A4XX_MAX_TEXEL_BUFFER_ELEMENTS_UINT (1 << 27)
455
456 static inline uint32_t
fd_clamp_buffer_size(enum pipe_format format,uint32_t size,unsigned max_texel_buffer_elements)457 fd_clamp_buffer_size(enum pipe_format format, uint32_t size,
458 unsigned max_texel_buffer_elements)
459 {
460 /* The spec says:
461 * The number of texels in the texel array is then clamped to the value of
462 * the implementation-dependent limit GL_MAX_TEXTURE_BUFFER_SIZE.
463 *
464 * So compute the number of texels, compare to GL_MAX_TEXTURE_BUFFER_SIZE and update it.
465 */
466 unsigned blocksize = util_format_get_blocksize(format);
467 unsigned elements = MIN2(max_texel_buffer_elements, size / blocksize);
468
469 return elements * blocksize;
470 }
471
472
473 /*
474 * a4xx+ helpers:
475 */
476
477 static inline enum a4xx_state_block
fd4_stage2shadersb(gl_shader_stage type)478 fd4_stage2shadersb(gl_shader_stage type)
479 {
480 switch (type) {
481 case MESA_SHADER_VERTEX:
482 return SB4_VS_SHADER;
483 case MESA_SHADER_FRAGMENT:
484 return SB4_FS_SHADER;
485 case MESA_SHADER_COMPUTE:
486 case MESA_SHADER_KERNEL:
487 return SB4_CS_SHADER;
488 default:
489 unreachable("bad shader type");
490 return (enum a4xx_state_block) ~0;
491 }
492 }
493
494 static inline enum a4xx_index_size
fd4_size2indextype(unsigned index_size)495 fd4_size2indextype(unsigned index_size)
496 {
497 switch (index_size) {
498 case 1:
499 return INDEX4_SIZE_8_BIT;
500 case 2:
501 return INDEX4_SIZE_16_BIT;
502 case 4:
503 return INDEX4_SIZE_32_BIT;
504 }
505 DBG("unsupported index size: %d", index_size);
506 assert(0);
507 return INDEX4_SIZE_32_BIT;
508 }
509
/* Convert 19.2MHz RBBM always-on timer ticks to ns.
 *
 * One tick is 1e9 / 19.2e6 = 625/12 ns (~52.083ns).  The previous
 * pre-computed integer factor (1000000000 / 19200000 == 52) truncated,
 * under-reporting elapsed time by ~0.16%; multiply first, then divide.
 * ts * 625 stays within uint64_t for ~48 years worth of ticks.
 */
static inline uint64_t
ticks_to_ns(uint64_t ts)
{
   return ts * 625 / 12;
}
516
517 #ifdef __cplusplus
518 }
519 #endif
520
521 #endif /* FREEDRENO_UTIL_H_ */
522