1 /**************************************************************************
2 *
3 * Copyright 2017 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * on the rights to use, copy, modify, merge, publish, distribute, sub
10 * license, and/or sell copies of the Software, and to permit persons to whom
11 * the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23 * USE OR OTHER DEALINGS IN THE SOFTWARE.
24 *
25 **************************************************************************/
26
27 /* This is a wrapper for pipe_context that executes all pipe_context calls
28 * in another thread.
29 *
30 *
31 * Guidelines for adopters and deviations from Gallium
32 * ---------------------------------------------------
33 *
34 * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
35 * driver functions that take a context (fence_finish, texture_get_handle)
36 * should manually unwrap pipe_context by doing:
37 * pipe = threaded_context_unwrap_sync(pipe);
38 *
39 * pipe_context::priv is used to unwrap the context, so drivers and state
40 * trackers shouldn't use it.
41 *
42 * No other objects are wrapped.
43 *
44 * 2) Drivers must subclass and initialize these structures:
45 * - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
46 * - threaded_query for pipe_query (zero memory)
47 * - threaded_transfer for pipe_transfer (zero memory)
48 *
49 * 3) The threaded context must not be enabled for contexts that can use video
50 * codecs.
51 *
52 * 4) Changes in driver behavior:
53 * - begin_query and end_query always return true; return values from
54 * the driver are ignored.
55 * - generate_mipmap uses is_format_supported to determine success;
56 * the return value from the driver is ignored.
57 * - resource_commit always returns true; failures are ignored.
58 * - set_debug_callback is skipped if the callback is synchronous.
59 *
60 *
61 * Thread-safety requirements on context functions
62 * -----------------------------------------------
63 *
64 * These pipe_context functions are executed directly, so they shouldn't use
65 * pipe_context in an unsafe way. They are de-facto screen functions now:
66 * - create_query
67 * - create_batch_query
68 * - create_*_state (all CSOs and shaders)
69 * - Make sure the shader compiler doesn't use any per-context stuff.
70 * (e.g. LLVM target machine)
71 * - Only pipe_context's debug callback for shader dumps is guaranteed to
72 * be up to date, because set_debug_callback synchronizes execution.
73 * - create_surface
74 * - surface_destroy
75 * - create_sampler_view
76 * - sampler_view_destroy
77 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
79 * - get_query_result (when threaded_query::flushed == true)
80 * - create_stream_output_target
81 *
82 *
83 * Transfer_map rules for buffer mappings
84 * --------------------------------------
85 *
86 * 1) If transfer_map has PIPE_MAP_UNSYNCHRONIZED, the call is made
87 * in the non-driver thread without flushing the queue. The driver will
88 * receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_MAP_-
89 * UNSYNCHRONIZED to indicate this.
90 * Note that transfer_unmap is always enqueued and called from the driver
91 * thread.
92 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
94 * the valid buffer range. The threaded context always sends TC_TRANSFER_-
95 * MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring the flag will lead
96 * to failures.
97 * The threaded context does its own detection of unsynchronized mappings.
98 *
 * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary for unsynchronized maps to map the latest
 *    version of the buffer. (because invalidations can be queued, while
 *    unsynchronized maps are not queued and they should return the latest
 *    storage after invalidation). The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
107 *
108 * 4) PIPE_MAP_ONCE can no longer be used to infer that a buffer will not be mapped
109 * a second time before it is unmapped.
110 *
111 *
112 * Rules for fences
113 * ----------------
114 *
115 * Flushes will be executed asynchronously in the driver thread if a
116 * create_fence callback is provided. This affects fence semantics as follows.
117 *
118 * When the threaded context wants to perform an asynchronous flush, it will
119 * use the create_fence callback to pre-create the fence from the calling
120 * thread. This pre-created fence will be passed to pipe_context::flush
121 * together with the TC_FLUSH_ASYNC flag.
122 *
123 * The callback receives the unwrapped context as a parameter, but must use it
124 * in a thread-safe way because it is called from a non-driver thread.
125 *
126 * If the threaded_context does not immediately flush the current batch, the
127 * callback also receives a tc_unflushed_batch_token. If fence_finish is called
128 * on the returned fence in the context that created the fence,
129 * threaded_context_flush must be called.
130 *
131 * The driver must implement pipe_context::fence_server_sync properly, since
132 * the threaded context handles PIPE_FLUSH_ASYNC.
133 *
134 *
135 * Additional requirements
136 * -----------------------
137 *
138 * get_query_result:
139 * If threaded_query::flushed == true, get_query_result should assume that
140 * it's called from a non-driver thread, in which case the driver shouldn't
141 * use the context in an unsafe way.
142 *
143 * replace_buffer_storage:
144 * The driver has to implement this callback, which will be called when
145 * the threaded context wants to replace a resource's backing storage with
146 * another resource's backing storage. The threaded context uses it to
147 * implement buffer invalidation. This call is always queued.
148 * Note that 'minimum_num_rebinds' specifies only the minimum number of rebinds
149 * which must be managed by the driver; if a buffer is bound multiple times in
150 * the same binding point (e.g., vertex buffer slots 0,1,2), this will be counted
151 * as a single rebind.
152 *
153 *
154 * Optional resource busy callbacks for better performance
155 * -------------------------------------------------------
156 *
157 * This adds checking whether a resource is used by the GPU and whether
158 * a resource is referenced by an unflushed command buffer. If neither is true,
159 * the threaded context will map the buffer as UNSYNCHRONIZED without flushing
160 * or synchronizing the thread and will skip any buffer invalidations
161 * (reallocations) because invalidating an idle buffer has no benefit.
162 *
 * There is 1 driver callback and 1 TC callback:
164 *
165 * 1) is_resource_busy: It returns true when a resource is busy. If this is NULL,
166 * the resource is considered always busy.
167 *
168 * 2) tc_driver_internal_flush_notify: If the driver set
169 * driver_calls_flush_notify = true in threaded_context_create, it should
170 * call this after every internal driver flush. The threaded context uses it
171 * to track internal driver flushes for the purpose of tracking which
172 * buffers are referenced by an unflushed command buffer.
173 *
174 * If is_resource_busy is set, threaded_resource::buffer_id_unique must be
175 * generated by the driver, and the replace_buffer_storage callback should
176 * delete the buffer ID passed to it. The driver should use
177 * util_idalloc_mt_init_tc.
178 *
179 *
180 * How it works (queue architecture)
181 * ---------------------------------
182 *
183 * There is a multithreaded queue consisting of batches, each batch containing
184 * 8-byte slots. Calls can occupy 1 or more slots.
185 *
186 * Once a batch is full and there is no space for the next call, it's flushed,
187 * meaning that it's added to the queue for execution in the other thread.
188 * The batches are ordered in a ring and reused once they are idle again.
189 * The batching is necessary for low queue/mutex overhead.
190 */
191
192 #ifndef U_THREADED_CONTEXT_H
193 #define U_THREADED_CONTEXT_H
194
195 #include "c11/threads.h"
196 #include "pipe/p_context.h"
197 #include "pipe/p_state.h"
198 #include "util/bitset.h"
199 #include "util/u_inlines.h"
200 #include "util/u_queue.h"
201 #include "util/u_range.h"
202 #include "util/u_thread.h"
203 #include "util/slab.h"
204
205 struct threaded_context;
206 struct tc_unflushed_batch_token;
207
208 /* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
209 #define TC_DEBUG 0
210
211 /* These are map flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
213 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
214 /* Don't invalidate buffers: */
215 #define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30)
216 /* transfer_map is called from a non-driver thread: */
217 #define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)
218
219 /* Custom flush flags sent to drivers. */
220 /* fence is pre-populated with a fence created by the create_fence callback */
221 #define TC_FLUSH_ASYNC (1u << 31)
222
223 /* Size of the queue = number of batch slots in memory.
224 * - 1 batch is always idle and records new commands
225 * - 1 batch is being executed
226 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
227 *
228 * Use a size as small as possible for low CPU L2 cache usage but large enough
229 * so that the queue isn't stalled too often for not having enough idle batch
230 * slots.
231 */
232 #define TC_MAX_BATCHES 10
233
234 /* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
235 * can occupy multiple call slots.
236 *
237 * The idea is to have batches as small as possible but large enough so that
238 * the queuing and mutex overhead is negligible.
239 */
240 #define TC_SLOTS_PER_BATCH 1536
241
242 /* The buffer list queue is much deeper than the batch queue because buffer
243 * lists need to stay around until the driver internally flushes its command
244 * buffer.
245 */
246 #define TC_MAX_BUFFER_LISTS (TC_MAX_BATCHES * 4)
247
248 /* This mask is used to get a hash of a buffer ID. It's also the bit size of
249 * the buffer list - 1. It must be 2^n - 1. The size should be as low as
250 * possible to minimize memory usage, but high enough to minimize hash
251 * collisions.
252 */
253 #define TC_BUFFER_ID_MASK BITFIELD_MASK(14)
254
255 /* Threshold for when to use the queue or sync. */
256 #define TC_MAX_STRING_MARKER_BYTES 512
257
258 /* Threshold for when to enqueue buffer/texture_subdata as-is.
259 * If the upload size is greater than this, it will do instead:
260 * - for buffers: DISCARD_RANGE is done by the threaded context
261 * - for textures: sync and call the driver directly
262 */
263 #define TC_MAX_SUBDATA_BYTES 320
264
265 enum tc_binding_type {
266 TC_BINDING_VERTEX_BUFFER,
267 TC_BINDING_STREAMOUT_BUFFER,
268 TC_BINDING_UBO_VS,
269 TC_BINDING_UBO_FS,
270 TC_BINDING_UBO_GS,
271 TC_BINDING_UBO_TCS,
272 TC_BINDING_UBO_TES,
273 TC_BINDING_UBO_CS,
274 TC_BINDING_SAMPLERVIEW_VS,
275 TC_BINDING_SAMPLERVIEW_FS,
276 TC_BINDING_SAMPLERVIEW_GS,
277 TC_BINDING_SAMPLERVIEW_TCS,
278 TC_BINDING_SAMPLERVIEW_TES,
279 TC_BINDING_SAMPLERVIEW_CS,
280 TC_BINDING_SSBO_VS,
281 TC_BINDING_SSBO_FS,
282 TC_BINDING_SSBO_GS,
283 TC_BINDING_SSBO_TCS,
284 TC_BINDING_SSBO_TES,
285 TC_BINDING_SSBO_CS,
286 TC_BINDING_IMAGE_VS,
287 TC_BINDING_IMAGE_FS,
288 TC_BINDING_IMAGE_GS,
289 TC_BINDING_IMAGE_TCS,
290 TC_BINDING_IMAGE_TES,
291 TC_BINDING_IMAGE_CS,
292 };
293
294 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
295 struct pipe_resource *dst,
296 struct pipe_resource *src,
297 unsigned minimum_num_rebinds,
298 uint32_t rebind_mask,
299 uint32_t delete_buffer_id);
300 typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
301 struct tc_unflushed_batch_token *token);
302 typedef bool (*tc_is_resource_busy)(struct pipe_screen *screen,
303 struct pipe_resource *resource,
304 unsigned usage);
305
306 struct threaded_resource {
307 struct pipe_resource b;
308
309 /* Since buffer invalidations are queued, we can't use the base resource
310 * for unsychronized mappings. This points to the latest version of
311 * the buffer after the latest invalidation. It's only used for unsychro-
312 * nized mappings in the non-driver thread. Initially it's set to &b.
313 */
314 struct pipe_resource *latest;
315
316 /* The buffer range which is initialized (with a write transfer, streamout,
317 * or writable shader resources). The remainder of the buffer is considered
318 * invalid and can be mapped unsynchronized.
319 *
320 * This allows unsychronized mapping of a buffer range which hasn't been
321 * used yet. It's for applications which forget to use the unsynchronized
322 * map flag and expect the driver to figure it out.
323 *
324 * Drivers should set this to the full range for buffers backed by user
325 * memory.
326 */
327 struct util_range valid_buffer_range;
328
329 /* Drivers are required to update this for shared resources and user
330 * pointers. */
331 bool is_shared;
332 bool is_user_ptr;
333
334 /* Unique buffer ID. Drivers must set it to non-zero for buffers and it must
335 * be unique. Textures must set 0. Low bits are used as a hash of the ID.
336 * Use util_idalloc_mt to generate these IDs.
337 */
338 uint32_t buffer_id_unique;
339
340 /* If positive, then a staging transfer is in progress.
341 */
342 int pending_staging_uploads;
343
344 /* If staging uploads are pending, this will hold the union of the mapped
345 * ranges.
346 */
347 struct util_range pending_staging_uploads_range;
348 };
349
350 struct threaded_transfer {
351 struct pipe_transfer b;
352
353 /* Staging buffer for DISCARD_RANGE transfers. */
354 struct pipe_resource *staging;
355
356 /* If b.resource is not the base instance of the buffer, but it's one of its
357 * reallocations (set in "latest" of the base instance), this points to
358 * the valid range of the base instance. It's used for transfers after
359 * a buffer invalidation, because such transfers operate on "latest", not
360 * the base instance. Initially it's set to &b.resource->valid_buffer_range.
361 */
362 struct util_range *valid_buffer_range;
363 };
364
365 struct threaded_query {
366 /* The query is added to the list in end_query and removed in flush. */
367 struct list_head head_unflushed;
368
369 /* Whether pipe->flush has been called in non-deferred mode after end_query. */
370 bool flushed;
371 };
372
373 struct tc_call_base {
374 #if !defined(NDEBUG) && TC_DEBUG >= 1
375 uint32_t sentinel;
376 #endif
377 ushort num_slots;
378 ushort call_id;
379 };
380
381 /**
382 * A token representing an unflushed batch.
383 *
384 * See the general rules for fences for an explanation.
385 */
386 struct tc_unflushed_batch_token {
387 struct pipe_reference ref;
388 struct threaded_context *tc;
389 };
390
391 struct tc_batch {
392 struct threaded_context *tc;
393 #if !defined(NDEBUG) && TC_DEBUG >= 1
394 unsigned sentinel;
395 #endif
396 uint16_t num_total_slots;
397 uint16_t buffer_list_index;
398 struct util_queue_fence fence;
399 struct tc_unflushed_batch_token *token;
400 uint64_t slots[TC_SLOTS_PER_BATCH];
401 };
402
403 struct tc_buffer_list {
404 /* Signalled by the driver after it flushes its internal command buffer. */
405 struct util_queue_fence driver_flushed_fence;
406
407 /* Buffer list where bit N means whether ID hash N is in the list. */
408 BITSET_DECLARE(buffer_list, TC_BUFFER_ID_MASK + 1);
409 };
410
411 /**
412 * Optional TC parameters/callbacks.
413 */
414 struct threaded_context_options {
415 tc_create_fence_func create_fence;
416 tc_is_resource_busy is_resource_busy;
417 bool driver_calls_flush_notify;
418
419 /**
420 * If true, ctx->get_device_reset_status() will be called without
421 * synchronizing with driver thread. Drivers can enable this to avoid
422 * TC syncs if their implementation of get_device_reset_status() is
423 * safe to call without synchronizing with driver thread.
424 */
425 bool unsynchronized_get_device_reset_status;
426 };
427
428 struct threaded_context {
429 struct pipe_context base;
430 struct pipe_context *pipe;
431 struct slab_child_pool pool_transfers;
432 tc_replace_buffer_storage_func replace_buffer_storage;
433 struct threaded_context_options options;
434 unsigned map_buffer_alignment;
435 unsigned ubo_alignment;
436
437 struct list_head unflushed_queries;
438
439 /* Counters for the HUD. */
440 unsigned num_offloaded_slots;
441 unsigned num_direct_slots;
442 unsigned num_syncs;
443
444 bool use_forced_staging_uploads;
445 bool add_all_gfx_bindings_to_buffer_list;
446 bool add_all_compute_bindings_to_buffer_list;
447
448 /* Estimation of how much vram/gtt bytes are mmap'd in
449 * the current tc_batch.
450 */
451 uint64_t bytes_mapped_estimate;
452 uint64_t bytes_mapped_limit;
453
454 struct util_queue queue;
455 struct util_queue_fence *fence;
456
457 #ifndef NDEBUG
458 /**
459 * The driver thread is normally the queue thread, but
460 * there are cases where the queue is flushed directly
461 * from the frontend thread
462 */
463 thread_id driver_thread;
464 #endif
465
466 bool seen_tcs;
467 bool seen_tes;
468 bool seen_gs;
469
470 bool seen_streamout_buffers;
471 bool seen_shader_buffers[PIPE_SHADER_TYPES];
472 bool seen_image_buffers[PIPE_SHADER_TYPES];
473 bool seen_sampler_buffers[PIPE_SHADER_TYPES];
474
475 unsigned max_vertex_buffers;
476 unsigned max_const_buffers;
477 unsigned max_shader_buffers;
478 unsigned max_images;
479 unsigned max_samplers;
480
481 unsigned last, next, next_buf_list;
482
483 /* The list fences that the driver should signal after the next flush.
484 * If this is empty, all driver command buffers have been flushed.
485 */
486 struct util_queue_fence *signal_fences_next_flush[TC_MAX_BUFFER_LISTS];
487 unsigned num_signal_fences_next_flush;
488
489 /* Bound buffers are tracked here using threaded_resource::buffer_id_hash.
490 * 0 means unbound.
491 */
492 uint32_t vertex_buffers[PIPE_MAX_ATTRIBS];
493 uint32_t streamout_buffers[PIPE_MAX_SO_BUFFERS];
494 uint32_t const_buffers[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
495 uint32_t shader_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
496 uint32_t image_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
497 uint32_t shader_buffers_writeable_mask[PIPE_SHADER_TYPES];
498 uint32_t image_buffers_writeable_mask[PIPE_SHADER_TYPES];
499 /* Don't use PIPE_MAX_SHADER_SAMPLER_VIEWS because it's too large. */
500 uint32_t sampler_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
501
502 struct tc_batch batch_slots[TC_MAX_BATCHES];
503 struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
504 };
505
506 void threaded_resource_init(struct pipe_resource *res);
507 void threaded_resource_deinit(struct pipe_resource *res);
508 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
509 void tc_driver_internal_flush_notify(struct threaded_context *tc);
510
511 struct pipe_context *
512 threaded_context_create(struct pipe_context *pipe,
513 struct slab_parent_pool *parent_transfer_pool,
514 tc_replace_buffer_storage_func replace_buffer,
515 const struct threaded_context_options *options,
516 struct threaded_context **out);
517
518 void
519 threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned divisor);
520
521 void
522 threaded_context_flush(struct pipe_context *_pipe,
523 struct tc_unflushed_batch_token *token,
524 bool prefer_async);
525
526 void
527 tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
528 unsigned drawid_offset,
529 const struct pipe_draw_indirect_info *indirect,
530 const struct pipe_draw_start_count_bias *draws,
531 unsigned num_draws);
532
533 static inline struct threaded_context *
threaded_context(struct pipe_context * pipe)534 threaded_context(struct pipe_context *pipe)
535 {
536 return (struct threaded_context*)pipe;
537 }
538
539 static inline struct threaded_resource *
threaded_resource(struct pipe_resource * res)540 threaded_resource(struct pipe_resource *res)
541 {
542 return (struct threaded_resource*)res;
543 }
544
545 static inline struct threaded_query *
threaded_query(struct pipe_query * q)546 threaded_query(struct pipe_query *q)
547 {
548 return (struct threaded_query*)q;
549 }
550
551 static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer * transfer)552 threaded_transfer(struct pipe_transfer *transfer)
553 {
554 return (struct threaded_transfer*)transfer;
555 }
556
557 static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token ** dst,struct tc_unflushed_batch_token * src)558 tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
559 struct tc_unflushed_batch_token *src)
560 {
561 if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
562 free(*dst);
563 *dst = src;
564 }
565
566 /**
567 * Helper for !NDEBUG builds to assert that it is called from driver
568 * thread. This is to help drivers ensure that various code-paths
569 * are not hit indirectly from pipe entry points that are called from
570 * front-end/state-tracker thread.
571 */
572 static inline void
tc_assert_driver_thread(struct threaded_context * tc)573 tc_assert_driver_thread(struct threaded_context *tc)
574 {
575 if (!tc)
576 return;
577 #ifndef NDEBUG
578 assert(util_thread_id_equal(tc->driver_thread, util_get_thread_id()));
579 #endif
580 }
581
582 #endif
583