/*
 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#ifndef FREEDRENO_BATCH_H_
#define FREEDRENO_BATCH_H_

#include "util/list.h"
#include "util/simple_mtx.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/perf/u_trace.h"

#include "freedreno_context.h"
#include "freedreno_fence.h"
#include "freedreno_util.h"

#ifdef __cplusplus
extern "C" {
#endif

struct fd_resource;
struct fd_batch_key;
struct fd_batch_result;
/**
 * A subpass is a fragment of a batch potentially starting with a clear.
 * If the app does a mid-batch clear, that clear and the subsequent draws
 * can be split out into another subpass.  At gmem time, the appropriate
 * sysmem or gmem clears can be interleaved with the CP_INDIRECT_BUFFER
 * to the subpass's draw cmdstream.
 *
 * For depth clears, a replacement LRZ buffer can be allocated (the clear
 * is still inserted into the prologue cmdstream since it needs to be
 * executed even in sysmem or if we aren't binning, since later batches
 * could depend on the LRZ state).  The alternative would be to invalidate
 * LRZ for draws after the start of the new subpass.
 */
struct fd_batch_subpass {
   struct list_head node;

   /** draw pass cmdstream: */
   struct fd_ringbuffer *draw;

   /** for the gmem code to stash per tile per subpass clears */
   struct fd_ringbuffer *subpass_clears;

   BITMASK_ENUM(fd_buffer_mask) fast_cleared;

   union pipe_color_union clear_color[MAX_RENDER_TARGETS];
   double clear_depth;
   unsigned clear_stencil;

   /**
    * The number of draws emitted to this subpass.  If it is greater than
    * zero, a clear triggers creating a new subpass (because clears must
    * always come at the start of a subpass).
    */
   unsigned num_draws;
   /**
    * If a subpass starts with an LRZ clear, it gets a new LRZ buffer.
    * fd_resource::lrz always tracks the current lrz buffer, but at
    * binning/gmem time we need to know which lrz buffer was current at
    * the time the subpass's draws were emitted.  That is tracked here.
    */
   struct fd_bo *lrz;
};

/**
 * A batch tracks everything about a cmdstream batch/submit, including the
 * ringbuffers used for binning, draw, and gmem cmds, the list of associated
 * fd_resource-s, etc.
 */
struct fd_batch {
   struct pipe_reference reference;
   unsigned seqno;
   unsigned idx; /* index into cache->batches[] */

   struct u_trace trace;

   /* To detect cases where we can skip cmdstream to record timestamp: */
   uint32_t *last_timestamp_cmd;

   int in_fence_fd;
   struct pipe_fence_handle *fence;

   struct fd_context *ctx;

   /* Do we need to mem2gmem before rendering?  We don't, if for example,
    * there was a glClear() that invalidated the entire previous buffer
    * contents.  Keep track of which buffer(s) are cleared, or need
    * restore.  Masks of PIPE_CLEAR_*
    *
    * The 'cleared' bits will be set for buffers which are *entirely*
    * cleared.
    *
    * The 'invalidated' bits are set for cleared buffers, and buffers
    * where the contents are undefined, ie. what we don't need to restore
    * to gmem.
    */
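   /* Illustrative example (not taken from the driver code): a full-surface
    * color clear would set the corresponding color bit in both 'cleared'
    * and 'invalidated', so no mem2gmem restore of that buffer is needed,
    * while an invalidate of depth (contents undefined, nothing to clear)
    * would set only the depth 'invalidated' bit.
    */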
   BITMASK_ENUM(fd_buffer_mask) invalidated, cleared, restore, resolve;

   /* Is this a non-draw batch (ie. compute/blit which has no pfb state)? */
   bool nondraw : 1;
   bool needs_flush : 1;
   bool flushed : 1;
   bool tessellation : 1; /* tessellation used in batch */

   /* Keep track of whether WAIT_FOR_IDLE is needed for registers we need
    * to update via RMW:
    */
   bool needs_wfi : 1;

   /* To decide whether to render to system memory, keep track of the
    * number of draws, and whether any of them require multisample,
    * depth_test (or depth write), stencil_test, blending, and
    * color_logic_op (since those functions are disabled when bypassing
    * GMEM).
    */
   BITMASK_ENUM(fd_gmem_reason) gmem_reason;

   /* At submit time, once we've decided that this batch will use GMEM
    * rendering, the appropriate gmem state is looked up:
    */
   const struct fd_gmem_stateobj *gmem_state;

   /* Driver specific barrier/flush flags: */
   unsigned barrier;

   /* A calculated "draw cost" value for the batch, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *       cost += num_mrt;
    *       if (blend_enabled)
    *          cost += num_mrt;
    *       if (depth_test_enabled)
    *          cost++;
    *       if (depth_write_enabled)
    *          cost++;
    *    }
    *
    * The idea is that each sample-passed minimally does one write
    * per MRT.  If blend is enabled, the hw will additionally do
    * a framebuffer read per sample-passed (for each MRT with blend
    * enabled).  If depth-test is enabled, the hw will additionally
    * do a depth buffer read.  If depth-write is enabled, the hw will
    * additionally do a depth buffer write.
    *
    * This ignores depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details.  But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
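   /* Worked example of the heuristic above (illustrative only): a draw to
    * two MRTs with blending enabled on both, depth-test on, and depth-write
    * off would add 2 + 2 + 1 = 5 to the cost.
    */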
   unsigned cost;

   /* Tells the gen specific backend where to write stats used for
    * the autotune module.
    *
    * Pointer only valid during gmem emit code.
    */
   struct fd_batch_result *autotune_result;

   unsigned num_draws;    /* number of draws in current batch */
   unsigned num_vertices; /* number of vertices in current batch */

   /* Currently only used on a6xx, to calculate vsc prim/draw stream
    * sizes:
    */
   unsigned num_bins_per_pipe;
   unsigned prim_strm_bits;
   unsigned draw_strm_bits;

   /* Track the maximal bounds of the scissor of all the draws within a
    * batch.  Used at the tile rendering step (fd_gmem_render_tiles(),
    * mem2gmem/gmem2mem) to avoid needlessly moving data in/out of gmem.
    *
    * Note that unlike gallium state, maxx/maxy are inclusive (for a
    * fully covered 512x512 surface the scissor would be 0,0+511,511).
    */
   struct pipe_scissor_state max_scissor;

   /* Keep track of DRAW initiators that need to be patched up depending
    * on whether we are using binning or not:
    */
   struct util_dynarray draw_patches;

   /* texture state that needs patching for fb_read: */
   struct util_dynarray fb_read_patches;

   /* Keep track of writes to RB_RENDER_CONTROL which need to be patched
    * once we know whether or not to use GMEM, and GMEM tile pitch.
    *
    * (Only for a3xx.. but having gen specific subclasses of fd_batch
    * seemed overkill for now.)
    */
   struct util_dynarray rbrc_patches;

   /* Keep track of GMEM related values that need to be patched up once we
    * know the gmem layout:
    */
   struct util_dynarray gmem_patches;

   /* Keep track of pointers to the start of MEM exports for a20x binning
    * shaders, so the end of the shader can be cut off at the right point
    * depending on the GMEM configuration.
    */
   struct util_dynarray shader_patches;

   struct pipe_framebuffer_state framebuffer;

   struct fd_submit *submit;

   /**
    * List of fd_batch_subpass.
    */
   struct list_head subpasses;

#define foreach_subpass(subpass, batch)                                        \
   list_for_each_entry (struct fd_batch_subpass, subpass, &batch->subpasses, node)
#define foreach_subpass_safe(subpass, batch)                                   \
   list_for_each_entry_safe (struct fd_batch_subpass, subpass, &batch->subpasses, node)
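
   /* Usage sketch for the iterators above (hedged example, not taken from
    * the gmem code; 'emit_clears' and 'emit_subpass_ib' stand in for
    * whatever the per-gen backend actually does):
    *
    *    foreach_subpass (subpass, batch) {
    *       if (subpass->fast_cleared)
    *          emit_clears(batch, subpass);
    *       emit_subpass_ib(batch, subpass->draw);
    *    }
    */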

   /**
    * The current subpass.
    */
   struct fd_batch_subpass *subpass;

   /**
    * Just a reference to the current subpass's draw cmds, for backwards
    * compat.
    */
   struct fd_ringbuffer *draw;
   /** binning pass cmdstream: */
   struct fd_ringbuffer *binning;
   /** tiling/gmem (IB0) cmdstream: */
   struct fd_ringbuffer *gmem;

   /** prologue cmdstream (executed once before first tile): */
   struct fd_ringbuffer *prologue;

   /** epilogue cmdstream (executed after each tile): */
   struct fd_ringbuffer *tile_epilogue;

   /** epilogue cmdstream (executed after all tiles): */
   struct fd_ringbuffer *epilogue;

   struct fd_ringbuffer *tile_loads;
   struct fd_ringbuffer *tile_store;

   /**
    * hw query related state:
    */
   /*@{*/
   /* next sample offset.. incremented for each sample in the batch/
    * submit, reset to zero on next submit.
    */
   uint32_t next_sample_offset;

   /* The # of pipeline-stats queries running.  In case of nested
    * queries using {START/STOP}_{PRIMITIVE,FRAGMENT,COMPUTE}_CNTRS,
    * we need to start only on the first one and stop only on the
    * last one.
    */
   uint8_t pipeline_stats_queries_active[3];

   /* cached samples (in case multiple queries need to reference
    * the same sample snapshot)
    */
   struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS];

   /* which sample providers were used in the current batch: */
   uint32_t query_providers_used;

   /* which sample providers are currently enabled in the batch: */
   uint32_t query_providers_active;

   /* list of samples in current batch: */
   struct util_dynarray samples;

   /* current query result bo and tile stride: */
   struct pipe_resource *query_buf;
   uint32_t query_tile_stride;
   /*@}*/

   /* Set of resources used by currently-unsubmitted batch (read or
    * write).. does not hold a reference to the resource.
    */
   struct set *resources;

   /** key in batch-cache (if not null): */
   struct fd_batch_key *key;
   uint32_t hash;

   /** set of dependent batches.. holds refs to dependent batches: */
   uint32_t dependents_mask;
};

struct fd_batch *fd_batch_create(struct fd_context *ctx, bool nondraw);

struct fd_batch_subpass *fd_batch_create_subpass(struct fd_batch *batch) assert_dt;

void fd_batch_set_fb(struct fd_batch *batch, const struct pipe_framebuffer_state *pfb) assert_dt;

void fd_batch_flush(struct fd_batch *batch) assert_dt;
bool fd_batch_has_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt;
void fd_batch_resource_write(struct fd_batch *batch,
                             struct fd_resource *rsc) assert_dt;
void fd_batch_resource_read_slowpath(struct fd_batch *batch,
                                     struct fd_resource *rsc) assert_dt;
void fd_batch_check_size(struct fd_batch *batch) assert_dt;
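
/* Rough usage sketch of the batch API above (hedged, driver-internal details
 * elided; in practice batches are usually obtained via the batch cache
 * rather than created directly, and 'pfb' is assumed to be the context's
 * current framebuffer state):
 *
 *    struct fd_batch *batch = fd_batch_create(ctx, false);
 *    fd_batch_set_fb(batch, pfb);
 *    fd_batch_needs_flush(batch);
 *    ... record draws into batch->subpass->draw ...
 *    fd_batch_flush(batch);
 *    fd_batch_reference(&batch, NULL);
 */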

uint32_t fd_batch_key_hash(const void *_key);
bool fd_batch_key_equals(const void *_a, const void *_b);
struct fd_batch_key *fd_batch_key_clone(void *mem_ctx,
                                        const struct fd_batch_key *key);

/* not called directly: */
void __fd_batch_describe(char *buf, const struct fd_batch *batch) assert_dt;
void __fd_batch_destroy_locked(struct fd_batch *batch);
void __fd_batch_destroy(struct fd_batch *batch);

/*
 * NOTE the rule is, you need to hold the screen->lock when destroying
 * a batch.. so either use fd_batch_reference() (which grabs the lock
 * for you) if you don't hold the lock, or fd_batch_reference_locked()
 * if you do hold the lock.
 *
 * WARNING the _locked() version can briefly drop the lock.  Without
 * recursive mutexes, I'm not sure there is much else we can do (since
 * __fd_batch_destroy() needs to unref resources).
 *
 * WARNING you must acquire the screen->lock and use the _locked()
 * version in the case that the batch being ref'd can disappear out
 * from under you.
 */

static inline void
fd_batch_reference_locked(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   /* only need lock if a reference is dropped: */
   if (old_batch)
      fd_screen_assert_locked(old_batch->ctx->screen);

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy_locked(old_batch);

   *ptr = batch;
}

static inline void
fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch)
{
   struct fd_batch *old_batch = *ptr;

   if (pipe_reference_described(
          &(*ptr)->reference, &batch->reference,
          (debug_reference_descriptor)__fd_batch_describe))
      __fd_batch_destroy(old_batch);

   *ptr = batch;
}
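
/* Hedged usage sketch for the reference helpers above ('src_batch' stands
 * for whatever batch pointer the caller wants to hold on to):
 *
 *    struct fd_batch *tmp = NULL;
 *
 *    fd_batch_reference(&tmp, src_batch);   // take a ref, lock not held
 *    ... use tmp ...
 *    fd_batch_reference(&tmp, NULL);        // drop the ref
 *
 *    // From code that already holds screen->lock:
 *    fd_batch_reference_locked(&tmp, src_batch);
 */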

/**
 * Mark the batch as having something worth flushing (rendering, blit, query,
 * etc)
 */
static inline void
fd_batch_needs_flush(struct fd_batch *batch)
{
   batch->needs_flush = true;
   fd_pipe_fence_ref(&batch->ctx->last_fence, NULL);
}

/* Since we reorder batches and can pause/resume queries (notably for disabling
 * queries during some meta operations), we update the current query state for
 * the batch before each draw.
 */
static inline void
fd_batch_update_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   if (!(ctx->dirty & FD_DIRTY_QUERY))
      return;

   ctx->query_update_batch(batch, false);
}

static inline void
fd_batch_finish_queries(struct fd_batch *batch) assert_dt
{
   struct fd_context *ctx = batch->ctx;

   ctx->query_update_batch(batch, true);
}

static inline void
fd_reset_wfi(struct fd_batch *batch)
{
   batch->needs_wfi = true;
}

void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt;

/* emit a CP_EVENT_WRITE:
 */
static inline void
fd_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring,
               enum vgt_event_type evt)
{
   OUT_PKT3(ring, CP_EVENT_WRITE, 1);
   OUT_RING(ring, evt);
   fd_reset_wfi(batch);
}

/* Get per-tile epilogue */
static inline struct fd_ringbuffer *
fd_batch_get_tile_epilogue(struct fd_batch *batch)
{
   if (batch->tile_epilogue == NULL) {
      batch->tile_epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000,
                                                      FD_RINGBUFFER_GROWABLE);
   }

   return batch->tile_epilogue;
}

/* Get epilogue run after all tiles */
static inline struct fd_ringbuffer *
fd_batch_get_epilogue(struct fd_batch *batch)
{
   if (batch->epilogue == NULL) {
      batch->epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000,
                                                 FD_RINGBUFFER_GROWABLE);
   }

   return batch->epilogue;
}

struct fd_ringbuffer *fd_batch_get_prologue(struct fd_batch *batch);

#ifdef __cplusplus
}
#endif

#endif /* FREEDRENO_BATCH_H_ */