1 /**************************************************************************
2  *
3  * Copyright 2017 Advanced Micro Devices, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * on the rights to use, copy, modify, merge, publish, distribute, sub
10  * license, and/or sell copies of the Software, and to permit persons to whom
11  * the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23  * USE OR OTHER DEALINGS IN THE SOFTWARE.
24  *
25  **************************************************************************/
26 
27 /* This is a wrapper for pipe_context that executes all pipe_context calls
28  * in another thread.
29  *
30  *
31  * Guidelines for adopters and deviations from Gallium
32  * ---------------------------------------------------
33  *
34  * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
35  *    driver functions that take a context (fence_finish, texture_get_handle)
36  *    should manually unwrap pipe_context (see the sketch after this list) by doing:
37  *      pipe = threaded_context_unwrap_sync(pipe);
38  *
39  *    pipe_context::priv is used to unwrap the context, so drivers and state
40  *    trackers shouldn't use it.
41  *
42  *    No other objects are wrapped.
43  *
44  * 2) Drivers must subclass and initialize these structures:
45  *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
46  *    - threaded_query for pipe_query (zero memory)
47  *    - threaded_transfer for pipe_transfer (zero memory)
48  *
49  * 3) The threaded context must not be enabled for contexts that can use video
50  *    codecs.
51  *
52  * 4) Changes in driver behavior:
53  *    - begin_query and end_query always return true; return values from
54  *      the driver are ignored.
55  *    - generate_mipmap uses is_format_supported to determine success;
56  *      the return value from the driver is ignored.
57  *    - resource_commit always returns true; failures are ignored.
58  *    - set_debug_callback is skipped if the callback is synchronous.
59  *
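 *    A minimal sketch of guideline 1 for a pipe_screen fence_finish hook; the
 *    foo_* names are hypothetical and only the unwrap call is prescribed here:
 *
 *       static bool foo_fence_finish(struct pipe_screen *screen,
 *                                    struct pipe_context *ctx,
 *                                    struct pipe_fence_handle *fence,
 *                                    uint64_t timeout)
 *       {
 *          if (ctx)
 *             ctx = threaded_context_unwrap_sync(ctx);
 *          return foo_fence_wait(screen, ctx, fence, timeout);
 *       }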
60  *
61  * Thread-safety requirements on context functions
62  * -----------------------------------------------
63  *
64  * These pipe_context functions are executed directly, so they shouldn't use
65  * pipe_context in an unsafe way. They are de-facto screen functions now:
66  * - create_query
67  * - create_batch_query
68  * - create_*_state (all CSOs and shaders)
69  *     - Make sure the shader compiler doesn't use any per-context stuff.
70  *       (e.g. LLVM target machine)
71  *     - Only pipe_context's debug callback for shader dumps is guaranteed to
72  *       be up to date, because set_debug_callback synchronizes execution.
73  * - create_surface
74  * - surface_destroy
75  * - create_sampler_view
76  * - sampler_view_destroy
77  * - stream_output_target_destroy
78  * - transfer_map (only unsynchronized buffer mappings)
79  * - get_query_result (when threaded_query::flushed == true)
80  * - create_stream_output_target
81  * - get_sample_position
82  *
83  *
84  * Transfer_map rules for buffer mappings
85  * --------------------------------------
86  *
87  * 1) If transfer_map has PIPE_MAP_UNSYNCHRONIZED, the call is made
88  *    in the non-driver thread without flushing the queue. The driver will
89  *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_MAP_-
90  *    UNSYNCHRONIZED to indicate this.
91  *    Note that transfer_unmap is always enqueued and called from the driver
92  *    thread.
93  *
94  * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
95  *    the valid buffer range. The threaded context always sends TC_TRANSFER_-
96  *    MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring the flag will lead
97  *    to failures.
98  *    The threaded context does its own detection of unsynchronized mappings.
99  *
100  * 3) The driver isn't allowed to do buffer invalidations by itself under any
101  *    circumstances. This is necessary for unsynchronized maps to map the latest
102  *    version of the buffer (invalidations can be queued, while unsynchronized
103  *    maps are not queued, so they must return the latest storage after an
104  *    invalidation). The threaded context always sends
105  *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
106  *    indicate this. Ignoring the flag will lead to failures.
107  *    The threaded context uses its own buffer invalidation mechanism.
108  *    Do NOT use pipe_buffer_write, as this may trigger invalidation;
109  *    use tc_buffer_write instead.
110  *
111  * 4) PIPE_MAP_ONCE can no longer be used to infer that a buffer will not be mapped
112  *    a second time before it is unmapped.
113  *
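 * A sketch of the driver-side handling implied by rules 1-3 in a buffer map
 * path; "rbuf" and foo_invalidate_buffer() are hypothetical driver pieces,
 * while the TC_* flags are defined later in this header:
 *
 *    if (!(usage & PIPE_MAP_UNSYNCHRONIZED) &&
 *        !(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED) &&
 *        !util_ranges_intersect(&rbuf->valid_range, box->x,
 *                               box->x + box->width))
 *       usage |= PIPE_MAP_UNSYNCHRONIZED;  // forbidden when the flag is set
 *
 *    if ((usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
 *        !(usage & TC_TRANSFER_MAP_NO_INVALIDATE))
 *       foo_invalidate_buffer(ctx, rbuf);  // never reached with TC
 *
 *    if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) {
 *       // Rule 1: we are on the frontend thread; only touch driver state
 *       // that is safe to use without synchronization.
 *    }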
114  *
115  * Rules for fences
116  * ----------------
117  *
118  * Flushes will be executed asynchronously in the driver thread if a
119  * create_fence callback is provided. This affects fence semantics as follows.
120  *
121  * When the threaded context wants to perform an asynchronous flush, it will
122  * use the create_fence callback to pre-create the fence from the calling
123  * thread. This pre-created fence will be passed to pipe_context::flush
124  * together with the TC_FLUSH_ASYNC flag.
125  *
126  * The callback receives the unwrapped context as a parameter, but must use it
127  * in a thread-safe way because it is called from a non-driver thread.
128  *
129  * If the threaded_context does not immediately flush the current batch, the
130  * callback also receives a tc_unflushed_batch_token. If fence_finish is called
131  * on the returned fence in the context that created the fence,
132  * threaded_context_flush must be called.
133  *
134  * The driver must implement pipe_context::fence_server_sync properly, since
135  * the threaded context handles PIPE_FLUSH_ASYNC.
136  *
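 * A sketch of the pieces described above, using hypothetical foo_* driver
 * types; tc_create_fence_func, TC_FLUSH_ASYNC, threaded_context_flush and
 * tc_unflushed_batch_token_reference all come from this header:
 *
 *    static struct pipe_fence_handle *
 *    foo_create_fence(struct pipe_context *ctx,
 *                     struct tc_unflushed_batch_token *token)
 *    {
 *       // May run on the frontend thread: don't touch unsafe context state.
 *       struct foo_fence *fence = foo_fence_alloc();
 *       tc_unflushed_batch_token_reference(&fence->tc_token, token);
 *       return (struct pipe_fence_handle *)fence;
 *    }
 *
 *    // In the driver's pipe_context::flush, "flags & TC_FLUSH_ASYNC" means the
 *    // passed-in fence was pre-created by the callback above.
 *
 *    // In pipe_screen::fence_finish, if the fence still holds a token for the
 *    // calling context, the batch must be flushed first; one possible policy:
 *    //    threaded_context_flush(ctx, fence->tc_token, timeout == 0);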
137  *
138  * Additional requirements
139  * -----------------------
140  *
141  * get_query_result:
142  *    If threaded_query::flushed == true, get_query_result should assume that
143  *    it's called from a non-driver thread, in which case the driver shouldn't
144  *    use the context in an unsafe way.
145  *
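 *    A sketch of the expected check (threaded_query() is the cast helper
 *    declared later in this header):
 *
 *       struct threaded_query *tq = threaded_query(query);
 *       if (tq->flushed) {
 *          // Possibly called from the frontend thread: don't flush and don't
 *          // touch non-thread-safe context state.
 *       }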
146  * replace_buffer_storage:
147  *    The driver has to implement this callback, which will be called when
148  *    the threaded context wants to replace a resource's backing storage with
149  *    another resource's backing storage. The threaded context uses it to
150  *    implement buffer invalidation. This call is always queued.
151  *    Note that 'minimum_num_rebinds' specifies only the minimum number of rebinds
152  *    which must be managed by the driver; if a buffer is bound multiple times in
153  *    the same binding point (e.g., vertex buffer slots 0,1,2), this will be counted
154  *    as a single rebind.
155  *    A buffer which has had its backing storage replaced may have its backing storage
156  *    accessed through multiple pipe_resources.
157  *
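 *    A sketch matching the tc_replace_buffer_storage_func signature declared
 *    below; the foo_* names are hypothetical driver pieces:
 *
 *       static void
 *       foo_replace_buffer_storage(struct pipe_context *ctx,
 *                                  struct pipe_resource *dst,
 *                                  struct pipe_resource *src,
 *                                  unsigned minimum_num_rebinds,
 *                                  uint32_t rebind_mask,
 *                                  uint32_t delete_buffer_id)
 *       {
 *          // Point dst at src's backing storage and rebind at least
 *          // minimum_num_rebinds uses of dst indicated by rebind_mask.
 *          foo_rebind_buffer(ctx, foo_resource(dst), foo_resource(src),
 *                            minimum_num_rebinds, rebind_mask);
 *
 *          // Free the stale unique buffer ID (see the busy-callback section
 *          // below, e.g. with the util_idalloc_mt API).
 *          foo_free_buffer_id(ctx->screen, delete_buffer_id);
 *       }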
158  *
159  * Optional resource busy callbacks for better performance
160  * -------------------------------------------------------
161  *
162  * This adds checking whether a resource is used by the GPU and whether
163  * a resource is referenced by an unflushed command buffer. If neither is true,
164  * the threaded context will map the buffer as UNSYNCHRONIZED without flushing
165  * or synchronizing the thread and will skip any buffer invalidations
166  * (reallocations) because invalidating an idle buffer has no benefit.
167  *
168  * There is one driver callback and one TC callback:
169  *
170  * 1) is_resource_busy: It returns true when a resource is busy. If this is NULL,
171  *    the resource is considered always busy.
172  *
173  * 2) tc_driver_internal_flush_notify: If the driver set
174  *    driver_calls_flush_notify = true in threaded_context_create, it should
175  *    call this after every internal driver flush. The threaded context uses it
176  *    to track internal driver flushes for the purpose of tracking which
177  *    buffers are referenced by an unflushed command buffer.
178  *
179  * If is_resource_busy is set, threaded_resource::buffer_id_unique must be
180  * generated by the driver, and the replace_buffer_storage callback should
181  * delete the buffer ID passed to it. The driver should use
182  * util_idalloc_mt_init_tc.
183  *
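 * A sketch of wiring this up; foo_* names are hypothetical, while the option
 * fields and tc_driver_internal_flush_notify() are declared later in this
 * header:
 *
 *    struct threaded_context_options options = {0};
 *    options.is_resource_busy = foo_is_resource_busy;
 *    options.driver_calls_flush_notify = true;
 *    // pass &options to threaded_context_create()
 *
 *    // ...and after every internal command-buffer flush inside the driver:
 *    tc_driver_internal_flush_notify(tc);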
184  *
185  * How it works (queue architecture)
186  * ---------------------------------
187  *
188  * There is a multithreaded queue consisting of batches, each batch containing
189  * 8-byte slots. Calls can occupy 1 or more slots.
190  *
191  * Once a batch is full and there is no space for the next call, it's flushed,
192  * meaning that it's added to the queue for execution in the other thread.
193  * The batches are ordered in a ring and reused once they are idle again.
194  * The batching is necessary for low queue/mutex overhead.
195  */
196 
197 #ifndef U_THREADED_CONTEXT_H
198 #define U_THREADED_CONTEXT_H
199 
200 #include "c11/threads.h"
201 #include "pipe/p_context.h"
202 #include "pipe/p_state.h"
203 #include "util/bitset.h"
204 #include "util/u_inlines.h"
205 #include "util/u_memory.h"
206 #include "util/u_queue.h"
207 #include "util/u_range.h"
208 #include "util/u_thread.h"
209 #include "util/slab.h"
210 #include "util/u_dynarray.h"
211 
212 #ifdef __cplusplus
213 extern "C" {
214 #endif
215 
216 struct threaded_context;
217 struct tc_unflushed_batch_token;
218 
219 /* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
220 #define TC_DEBUG 0
221 
222 /* This is an internal flag not sent to the driver. */
223 #define TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE   (1u << 28)
224 /* These are map flags sent to drivers. */
225 /* Never infer whether it's safe to use unsynchronized mappings: */
226 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
227 /* Don't invalidate buffers: */
228 #define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
229 /* transfer_map is called from a non-driver thread: */
230 #define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)
231 
232 /* Custom flush flags sent to drivers. */
233 /* fence is pre-populated with a fence created by the create_fence callback */
234 #define TC_FLUSH_ASYNC        (1u << 31)
235 
236 /* Size of the queue = number of batch slots in memory.
237  * - 1 batch is always idle and records new commands
238  * - 1 batch is being executed
239  * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
240  *
241  * Use a size as small as possible for low CPU L2 cache usage but large enough
242  * so that the queue isn't stalled too often for not having enough idle batch
243  * slots.
244  */
245 #define TC_MAX_BATCHES        10
246 
247 /* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
248  * can occupy multiple call slots.
249  *
250  * The idea is to have batches as small as possible but large enough so that
251  * the queuing and mutex overhead is negligible.
252  */
253 #define TC_SLOTS_PER_BATCH    1536
254 
255 /* The buffer list queue is much deeper than the batch queue because buffer
256  * lists need to stay around until the driver internally flushes its command
257  * buffer.
258  */
259 #define TC_MAX_BUFFER_LISTS   (TC_MAX_BATCHES * 4)
260 
261 /* This mask is used to get a hash of a buffer ID. It's also the bit size of
262  * the buffer list - 1. It must be 2^n - 1. The size should be as low as
263  * possible to minimize memory usage, but high enough to minimize hash
264  * collisions.
265  */
266 #define TC_BUFFER_ID_MASK      BITFIELD_MASK(14)
267 
268 /* Threshold for when to use the queue or sync. */
269 #define TC_MAX_STRING_MARKER_BYTES  512
270 
271 /* Threshold for when to enqueue buffer/texture_subdata as-is.
272  * If the upload size is greater than this, the following is done instead:
273  * - for buffers: DISCARD_RANGE is done by the threaded context
274  * - for textures: sync and call the driver directly
275  */
276 #define TC_MAX_SUBDATA_BYTES        320
277 
278 enum tc_binding_type {
279    TC_BINDING_VERTEX_BUFFER,
280    TC_BINDING_STREAMOUT_BUFFER,
281    TC_BINDING_UBO_VS,
282    TC_BINDING_UBO_FS,
283    TC_BINDING_UBO_GS,
284    TC_BINDING_UBO_TCS,
285    TC_BINDING_UBO_TES,
286    TC_BINDING_UBO_CS,
287    TC_BINDING_SAMPLERVIEW_VS,
288    TC_BINDING_SAMPLERVIEW_FS,
289    TC_BINDING_SAMPLERVIEW_GS,
290    TC_BINDING_SAMPLERVIEW_TCS,
291    TC_BINDING_SAMPLERVIEW_TES,
292    TC_BINDING_SAMPLERVIEW_CS,
293    TC_BINDING_SSBO_VS,
294    TC_BINDING_SSBO_FS,
295    TC_BINDING_SSBO_GS,
296    TC_BINDING_SSBO_TCS,
297    TC_BINDING_SSBO_TES,
298    TC_BINDING_SSBO_CS,
299    TC_BINDING_IMAGE_VS,
300    TC_BINDING_IMAGE_FS,
301    TC_BINDING_IMAGE_GS,
302    TC_BINDING_IMAGE_TCS,
303    TC_BINDING_IMAGE_TES,
304    TC_BINDING_IMAGE_CS,
305 };
306 
307 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
308                                                struct pipe_resource *dst,
309                                                struct pipe_resource *src,
310                                                unsigned minimum_num_rebinds,
311                                                uint32_t rebind_mask,
312                                                uint32_t delete_buffer_id);
313 typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
314                                                           struct tc_unflushed_batch_token *token);
315 typedef bool (*tc_is_resource_busy)(struct pipe_screen *screen,
316                                     struct pipe_resource *resource,
317                                     unsigned usage);
318 
319 struct threaded_resource {
320    struct pipe_resource b;
321 
322    /* Since buffer invalidations are queued, we can't use the base resource
323     * for unsynchronized mappings. This points to the latest version of
324     * the buffer after the latest invalidation. It's only used for unsynchro-
325     * nized mappings in the non-driver thread. Initially it's set to &b.
326     */
327    struct pipe_resource *latest;
328 
329    /* Optional CPU storage of the buffer. When we get a partial glBufferSubData (implemented by
330     * copy_buffer) + glDrawElements, we don't want to drain the gfx pipeline before executing
331     * the copy. For ideal pipelining, we upload to this CPU storage and then reallocate
332     * the GPU storage completely and reupload everything without copy_buffer.
333     */
334    void *cpu_storage;
335 
336    /* The buffer range which is initialized (with a write transfer, streamout,
337     * or writable shader resources). The remainder of the buffer is considered
338     * invalid and can be mapped unsynchronized.
339     *
340     * This allows unsynchronized mapping of a buffer range which hasn't been
341     * used yet. It's for applications which forget to use the unsynchronized
342     * map flag and expect the driver to figure it out.
343     *
344     * Drivers should set this to the full range for buffers backed by user
345     * memory.
346     */
347    struct util_range valid_buffer_range;
348 
349    /* Drivers are required to update this for shared resources and user
350     * pointers. */
351    bool is_shared;
352    bool is_user_ptr;
353    bool allow_cpu_storage;
354 
355    /* internal tag for tc indicating which batch last touched this resource */
356    int8_t last_batch_usage;
357    /* for disambiguating last_batch_usage across batch cycles */
358    uint32_t batch_generation;
359 
360    /* Unique buffer ID. Drivers must set it to non-zero for buffers and it must
361     * be unique. Textures must set it to 0. Low bits are used as a hash of the ID.
362     * Use util_idalloc_mt to generate these IDs.
363     */
364    uint32_t buffer_id_unique;
365 
366    /* If positive, then a staging transfer is in progress.
367     */
368    int pending_staging_uploads;
369 
370    /* If staging uploads are pending, this will hold the union of the mapped
371     * ranges.
372     */
373    struct util_range pending_staging_uploads_range;
374 };
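
/* Example (a sketch): how a driver's buffer creation path might initialize the
 * threaded_resource base it embeds; fbuf/fscreen are hypothetical, and
 * fscreen->buffer_ids is assumed to be a util_idalloc_mt as recommended above:
 *
 *    threaded_resource_init(&fbuf->b.b, want_cpu_storage);
 *    fbuf->b.buffer_id_unique = util_idalloc_mt_alloc(&fscreen->buffer_ids);
 *    if (templ->bind & PIPE_BIND_SHARED)
 *       fbuf->b.is_shared = true;
 */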
375 
376 struct threaded_transfer {
377    struct pipe_transfer b;
378 
379    /* Staging buffer for DISCARD_RANGE transfers. */
380    struct pipe_resource *staging;
381 
382    /* If b.resource is not the base instance of the buffer, but it's one of its
383     * reallocations (set in "latest" of the base instance), this points to
384     * the valid range of the base instance. It's used for transfers after
385     * a buffer invalidation, because such transfers operate on "latest", not
386     * the base instance. Initially it's set to &b.resource->valid_buffer_range.
387     */
388    struct util_range *valid_buffer_range;
389 
390    bool cpu_storage_mapped;
391 };
392 
393 struct threaded_query {
394    /* The query is added to the list in end_query and removed in flush. */
395    struct list_head head_unflushed;
396 
397    /* Whether pipe->flush has been called in non-deferred mode after end_query. */
398    bool flushed;
399 };
400 
401 struct tc_call_base {
402 #if !defined(NDEBUG) && TC_DEBUG >= 1
403    uint32_t sentinel;
404 #endif
405    uint16_t num_slots;
406    uint16_t call_id;
407 };
408 
409 struct tc_draw_single {
410    struct tc_call_base base;
411    unsigned index_bias;
412    struct pipe_draw_info info;
413 };
414 
415 /**
416  * A token representing an unflushed batch.
417  *
418  * See the general rules for fences for an explanation.
419  */
420 struct tc_unflushed_batch_token {
421    struct pipe_reference ref;
422    struct threaded_context *tc;
423 };
424 
425 struct tc_renderpass_info {
426    union {
427       struct {
428          /* bitmask of full-cleared color buffers */
429          uint8_t cbuf_clear;
430          /* bitmask of not-full-cleared color buffers */
431          uint8_t cbuf_load;
432          /* bitmask of color buffers that have their stores invalidated */
433          uint8_t cbuf_invalidate;
434          /* whether the zsbuf is full-cleared */
435          bool zsbuf_clear : 1;
436          /* whether the zsbuf is partial-cleared */
437          bool zsbuf_clear_partial : 1;
438          /* whether the zsbuf is not-full-cleared */
439          bool zsbuf_load : 1;
440          /* whether the zsbuf is invalidated */
441          bool zsbuf_invalidate : 1;
442          /* whether a draw occurs */
443          bool has_draw : 1;
444          /* whether a framebuffer resolve occurs on cbuf[0] */
445          bool has_resolve : 1;
446          /* whether queries are ended during this renderpass */
447          bool has_query_ends : 1;
448          uint8_t pad : 1;
449          /* 32 bits offset */
450          /* bitmask of color buffers using fbfetch */
451          uint8_t cbuf_fbfetch;
452          /* whether the fragment shader writes to the zsbuf */
453          bool zsbuf_write_fs : 1;
454          /* whether the DSA state writes to the zsbuf */
455          bool zsbuf_write_dsa : 1;
456          /* whether the DSA state reads the zsbuf */
457          bool zsbuf_read_dsa : 1;
458          /* whether the zsbuf is used for fbfetch */
459          bool zsbuf_fbfetch : 1;
460          uint8_t pad2 : 4;
461          uint16_t pad3;
462       };
463       uint64_t data;
464       /* fb info is in data32[0] */
465       uint32_t data32[2];
466       /* cso info is in data16[2] */
467       uint16_t data16[4];
468       /* zsbuf fb info is in data8[3] */
469       uint8_t data8[8];
470    };
471 };
472 
473 static inline bool
474 tc_renderpass_info_is_zsbuf_used(const struct tc_renderpass_info *info)
475 {
476    return info->zsbuf_clear ||
477           info->zsbuf_clear_partial ||
478           info->zsbuf_write_fs ||
479           info->zsbuf_write_dsa ||
480           info->zsbuf_read_dsa ||
481           info->zsbuf_fbfetch;
482 }
483 
484 /* if a driver ends a renderpass early for some reason,
485  * this function can be called to reset any stored renderpass info
486  * to a "safe" state that will avoid data loss on framebuffer attachments
487  *
488  * note: ending a renderpass early if invalidate hints are applied will
489  * result in data loss
490  */
491 static inline void
492 tc_renderpass_info_reset(struct tc_renderpass_info *info)
493 {
494    info->data32[0] = 0;
495    info->cbuf_load = BITFIELD_MASK(8);
496    info->zsbuf_clear_partial = true;
497    info->has_draw = true;
498    info->has_query_ends = true;
499 }
500 
501 struct tc_batch {
502    struct threaded_context *tc;
503 #if !defined(NDEBUG) && TC_DEBUG >= 1
504    unsigned sentinel;
505 #endif
506    uint16_t num_total_slots;
507    uint16_t buffer_list_index;
508    /* the index of the current renderpass info for recording */
509    int16_t renderpass_info_idx;
510    uint16_t max_renderpass_info_idx;
511 
512    /* The last mergeable call that was added to this batch (i.e.
513     * buffer subdata). This might be out-of-date or NULL.
514     */
515    struct tc_call_base *last_mergeable_call;
516 
517    struct util_queue_fence fence;
518    /* whether the first set_framebuffer_state call has been seen by this batch */
519    bool first_set_fb;
520    uint8_t batch_idx;
521    struct tc_unflushed_batch_token *token;
522    uint64_t slots[TC_SLOTS_PER_BATCH];
523    struct util_dynarray renderpass_infos;
524 };
525 
526 struct tc_buffer_list {
527    /* Signalled by the driver after it flushes its internal command buffer. */
528    struct util_queue_fence driver_flushed_fence;
529 
530    /* Buffer list where bit N means whether ID hash N is in the list. */
531    BITSET_DECLARE(buffer_list, TC_BUFFER_ID_MASK + 1);
532 };
533 
534 /**
535  * Optional TC parameters/callbacks.
536  */
537 struct threaded_context_options {
538    tc_create_fence_func create_fence;
539    tc_is_resource_busy is_resource_busy;
540    bool driver_calls_flush_notify;
541 
542    /**
543     * If true, ctx->get_device_reset_status() will be called without
544     * synchronizing with driver thread.  Drivers can enable this to avoid
545     * TC syncs if their implementation of get_device_reset_status() is
546     * safe to call without synchronizing with driver thread.
547     */
548    bool unsynchronized_get_device_reset_status;
549 
550    /* If true, create_fence_fd doesn't access the context in the driver. */
551    bool unsynchronized_create_fence_fd;
552    /* if true, texture_subdata calls may occur unsynchronized with PIPE_MAP_UNSYNCHRONIZED */
553    bool unsynchronized_texture_subdata;
554    /* if true, parse and track renderpass info during execution */
555    bool parse_renderpass_info;
556    /* callbacks for drivers to read their DSA/FS state and update renderpass info accordingly
557     * note: drivers must ONLY append to renderpass info using |=
558     */
559    void (*dsa_parse)(void *state, struct tc_renderpass_info *info);
560    void (*fs_parse)(void *state, struct tc_renderpass_info *info);
561 };
562 
563 struct tc_vertex_buffers {
564    struct tc_call_base base;
565    uint8_t count;
566    struct pipe_vertex_buffer slot[0]; /* more will be allocated if needed */
567 };
568 
569 struct threaded_context {
570    struct pipe_context base;
571    struct pipe_context *pipe;
572    struct slab_child_pool pool_transfers;
573    tc_replace_buffer_storage_func replace_buffer_storage;
574    struct threaded_context_options options;
575    unsigned map_buffer_alignment;
576    unsigned ubo_alignment;
577 
578    struct list_head unflushed_queries;
579 
580    /* Counters for the HUD. */
581    unsigned num_offloaded_slots;
582    unsigned num_direct_slots;
583    unsigned num_syncs;
584 
585    bool use_forced_staging_uploads;
586    bool add_all_gfx_bindings_to_buffer_list;
587    bool add_all_compute_bindings_to_buffer_list;
588    uint8_t num_queries_active;
589 
590    /* Estimation of how many vram/gtt bytes are mmap'd in
591     * the current tc_batch.
592     */
593    uint64_t bytes_mapped_estimate;
594    uint64_t bytes_mapped_limit;
595 
596    /* Estimation of how many buffer bytes have been replaced (reallocated)
597     * in the current tc_batch.
598     */
599    uint64_t bytes_replaced_estimate;
600    uint64_t bytes_replaced_limit;
601 
602    struct util_queue queue;
603    struct util_queue_fence *fence;
604 
605 #ifndef NDEBUG
606    /**
607     * The driver thread is normally the queue thread, but
608     * there are cases where the queue is flushed directly
609     * from the frontend thread
610     */
611    thrd_t driver_thread;
612 #endif
613 
614    bool seen_tcs;
615    bool seen_tes;
616    bool seen_gs;
617    /* whether the current renderpass has seen a set_framebuffer_state call */
618    bool seen_fb_state;
619    /* whether a renderpass is currently active */
620    bool in_renderpass;
621    /* whether a query has ended more recently than a draw */
622    bool query_ended;
623    /* whether pipe_context::flush has been called */
624    bool flushing;
625 
626    bool seen_streamout_buffers;
627    bool seen_shader_buffers[PIPE_SHADER_TYPES];
628    bool seen_image_buffers[PIPE_SHADER_TYPES];
629    bool seen_sampler_buffers[PIPE_SHADER_TYPES];
630 
631    int8_t last_completed;
632 
633    uint8_t num_vertex_buffers;
634    unsigned max_const_buffers;
635    unsigned max_shader_buffers;
636    unsigned max_images;
637    unsigned max_samplers;
638    unsigned nr_cbufs;
639 
640    unsigned last, next, next_buf_list, batch_generation;
641 
642    /* The list of fences that the driver should signal after the next flush.
643     * If this is empty, all driver command buffers have been flushed.
644     */
645    struct util_queue_fence *signal_fences_next_flush[TC_MAX_BUFFER_LISTS];
646    unsigned num_signal_fences_next_flush;
647 
648    /* Bound buffers are tracked here using threaded_resource::buffer_id_hash.
649     * 0 means unbound.
650     */
651    uint32_t vertex_buffers[PIPE_MAX_ATTRIBS];
652    uint32_t streamout_buffers[PIPE_MAX_SO_BUFFERS];
653    uint32_t const_buffers[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
654    uint32_t shader_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
655    uint32_t image_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
656    uint32_t shader_buffers_writeable_mask[PIPE_SHADER_TYPES];
657    uint64_t image_buffers_writeable_mask[PIPE_SHADER_TYPES];
658    uint32_t sampler_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
659 
660    struct tc_batch batch_slots[TC_MAX_BATCHES];
661    struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
662    /* the current framebuffer attachments; [PIPE_MAX_COLOR_BUFS] is the zsbuf */
663    struct pipe_resource *fb_resources[PIPE_MAX_COLOR_BUFS + 1];
664    struct pipe_resource *fb_resolve;
665    /* accessed by main thread; preserves info across batches */
666    struct tc_renderpass_info *renderpass_info_recording;
667    /* accessed by driver thread */
668    struct tc_renderpass_info *renderpass_info;
669 };
670 
671 
672 void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage);
673 void threaded_resource_deinit(struct pipe_resource *res);
674 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
675 void tc_driver_internal_flush_notify(struct threaded_context *tc);
676 
677 /** function for getting the current renderpass info:
678  * - renderpass info is always non-null
679  *
680  * Rules:
681  * - threaded context must have been created with parse_renderpass_info=true
682  * - must be called after the driver receives a pipe_context::set_framebuffer_state callback
683  * - must be called after the driver receives a non-deferrable pipe_context::flush callback
684  * - renderpass info must not be used during any internal driver operations (e.g., u_blitter)
685  * - must not be called before the driver receives its first pipe_context::set_framebuffer_state callback
686  * - renderpass info is invalidated only for non-deferrable flushes and new framebuffer states
687  */
688 const struct tc_renderpass_info *
689 threaded_context_get_renderpass_info(struct threaded_context *tc);
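
/* Example (a sketch): a driver created with parse_renderpass_info=true might
 * query this when it starts a renderpass to pick load/clear behavior; foo_ctx
 * is a hypothetical driver context holding its threaded_context pointer:
 *
 *    const struct tc_renderpass_info *info =
 *       threaded_context_get_renderpass_info(foo_ctx->tc);
 *    bool needs_zs = tc_renderpass_info_is_zsbuf_used(info);
 *    bool clears_cbuf0 = info->cbuf_clear & 0x1;
 */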
690 
691 struct pipe_context *
692 threaded_context_create(struct pipe_context *pipe,
693                         struct slab_parent_pool *parent_transfer_pool,
694                         tc_replace_buffer_storage_func replace_buffer,
695                         const struct threaded_context_options *options,
696                         struct threaded_context **out);
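
/* Example (a sketch): typical creation at the end of a driver's context_create;
 * the fctx/fscreen/foo_* names are hypothetical. The returned context wraps the
 * driver context and should be handed to the frontend in its place:
 *
 *    struct threaded_context_options options = {0};
 *    options.create_fence = foo_create_fence;
 *
 *    struct pipe_context *ctx =
 *       threaded_context_create(&fctx->base, &fscreen->pool_transfers,
 *                               foo_replace_buffer_storage, &options,
 *                               &fctx->tc);
 *    return ctx;
 */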
697 
698 void
699 threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned divisor);
700 
701 void
702 threaded_context_flush(struct pipe_context *_pipe,
703                        struct tc_unflushed_batch_token *token,
704                        bool prefer_async);
705 
706 struct tc_draw_single *
707 tc_add_draw_single_call(struct pipe_context *_pipe,
708                         struct pipe_resource *index_bo);
709 struct pipe_vertex_buffer *
710 tc_add_set_vertex_buffers_call(struct pipe_context *_pipe, unsigned count);
711 
712 void
713 tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
714             unsigned drawid_offset,
715             const struct pipe_draw_indirect_info *indirect,
716             const struct pipe_draw_start_count_bias *draws,
717             unsigned num_draws);
718 
719 static inline struct threaded_context *
720 threaded_context(struct pipe_context *pipe)
721 {
722    return (struct threaded_context*)pipe;
723 }
724 
725 static inline struct threaded_resource *
726 threaded_resource(struct pipe_resource *res)
727 {
728    return (struct threaded_resource*)res;
729 }
730 
731 static inline struct threaded_query *
732 threaded_query(struct pipe_query *q)
733 {
734    return (struct threaded_query*)q;
735 }
736 
737 static inline struct threaded_transfer *
738 threaded_transfer(struct pipe_transfer *transfer)
739 {
740    return (struct threaded_transfer*)transfer;
741 }
742 
743 static inline void
744 tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
745                                    struct tc_unflushed_batch_token *src)
746 {
747    if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
748       free(*dst);
749    *dst = src;
750 }
751 
752 /**
753  * Helper for !NDEBUG builds to assert that it is called from driver
754  * thread.  This is to help drivers ensure that various code-paths
755  * are not hit indirectly from pipe entry points that are called from
756  * front-end/state-tracker thread.
757  */
758 static inline void
759 tc_assert_driver_thread(struct threaded_context *tc)
760 {
761    if (!tc)
762       return;
763 #ifndef NDEBUG
764    assert(u_thread_is_self(tc->driver_thread));
765 #endif
766 }
767 
768 /**
769  * This is called before GPU stores to disable the CPU storage because
770  * the CPU storage doesn't mirror the GPU storage.
771  *
772  * Drivers should also call it before exporting a DMABUF of a buffer.
773  */
774 static inline void
775 tc_buffer_disable_cpu_storage(struct pipe_resource *buf)
776 {
777    struct threaded_resource *tres = threaded_resource(buf);
778 
779    if (tres->cpu_storage) {
780       align_free(tres->cpu_storage);
781       tres->cpu_storage = NULL;
782    }
783    tres->allow_cpu_storage = false;
784 }
785 
786 static inline void
787 tc_buffer_write(struct pipe_context *pipe,
788                 struct pipe_resource *buf,
789                 unsigned offset,
790                 unsigned size,
791                 const void *data)
792 {
793    pipe->buffer_subdata(pipe, buf, PIPE_MAP_WRITE | TC_TRANSFER_MAP_NO_INVALIDATE, offset, size, data);
794 }
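
/* Example: the TC-safe replacement for a small pipe_buffer_write(), as noted in
 * the buffer-mapping rules at the top of this file:
 *
 *    uint32_t zero = 0;
 *    tc_buffer_write(pipe, buf, offset, sizeof(zero), &zero);
 */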
795 
796 static inline struct tc_buffer_list *
797 tc_get_next_buffer_list(struct pipe_context *_pipe)
798 {
799    struct threaded_context *tc = threaded_context(_pipe);
800 
801    return &tc->buffer_lists[tc->next_buf_list];
802 }
803 
804 /* Set a buffer binding and add it to the buffer list. */
805 static inline void
806 tc_bind_buffer(uint32_t *binding, struct tc_buffer_list *next, struct pipe_resource *buf)
807 {
808    uint32_t id = threaded_resource(buf)->buffer_id_unique;
809    *binding = id;
810    BITSET_SET(next->buffer_list, id & TC_BUFFER_ID_MASK);
811 }
812 
813 /* Reset a buffer binding. */
814 static inline void
815 tc_unbind_buffer(uint32_t *binding)
816 {
817    *binding = 0;
818 }
819 
820 static inline void
821 tc_track_vertex_buffer(struct pipe_context *_pipe, unsigned index,
822                          struct pipe_resource *buf,
823                          struct tc_buffer_list *next_buffer_list)
824 {
825    struct threaded_context *tc = threaded_context(_pipe);
826 
827    if (buf) {
828       tc_bind_buffer(&tc->vertex_buffers[index], next_buffer_list, buf);
829    } else {
830       tc_unbind_buffer(&tc->vertex_buffers[index]);
831    }
832 }
833 
834 #ifdef __cplusplus
835 }
836 #endif
837 
838 #endif
839