1 /**************************************************************************
2 *
3 * Copyright 2017 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * on the rights to use, copy, modify, merge, publish, distribute, sub
10 * license, and/or sell copies of the Software, and to permit persons to whom
11 * the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23 * USE OR OTHER DEALINGS IN THE SOFTWARE.
24 *
25 **************************************************************************/
26
27 #include "util/u_threaded_context.h"
28 #include "util/u_cpu_detect.h"
29 #include "util/format/u_format.h"
30 #include "util/u_inlines.h"
31 #include "util/u_memory.h"
32 #include "util/u_upload_mgr.h"
33 #include "driver_trace/tr_context.h"
34 #include "util/log.h"
35 #include "util/perf/cpu_trace.h"
36 #include "util/thread_sched.h"
37 #include "compiler/shader_info.h"
38
39 #if TC_DEBUG >= 1
40 #define tc_assert assert
41 #else
42 #define tc_assert(x)
43 #endif
44
45 #if TC_DEBUG >= 2
46 #define tc_printf mesa_logi
47 #define tc_asprintf asprintf
48 #define tc_strcmp strcmp
49 #else
50 #define tc_printf(...)
51 #define tc_asprintf(...) 0
52 #define tc_strcmp(...) 0
53 #endif
54
55 #define TC_SENTINEL 0x5ca1ab1e
56
57 #if TC_DEBUG >= 3 || defined(TC_TRACE)
58 static const char *tc_call_names[] = {
59 #define CALL(name) #name,
60 #include "u_threaded_context_calls.h"
61 #undef CALL
62 };
63 #endif
64
65 #ifdef TC_TRACE
66 # define TC_TRACE_SCOPE(call_id) MESA_TRACE_SCOPE(tc_call_names[call_id])
67 #else
68 # define TC_TRACE_SCOPE(call_id)
69 #endif
70
71 static void
72 tc_buffer_subdata(struct pipe_context *_pipe,
73 struct pipe_resource *resource,
74 unsigned usage, unsigned offset,
75 unsigned size, const void *data);
76
77 static void
78 tc_batch_check(UNUSED struct tc_batch *batch)
79 {
80 tc_assert(batch->sentinel == TC_SENTINEL);
81 tc_assert(batch->num_total_slots <= TC_SLOTS_PER_BATCH);
82 }
83
84 static void
85 tc_debug_check(struct threaded_context *tc)
86 {
87 for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
88 tc_batch_check(&tc->batch_slots[i]);
89 tc_assert(tc->batch_slots[i].tc == tc);
90 }
91 }
92
93 static void
94 tc_set_driver_thread(struct threaded_context *tc)
95 {
96 #ifndef NDEBUG
97 tc->driver_thread = thrd_current();
98 #endif
99 }
100
101 static void
102 tc_clear_driver_thread(struct threaded_context *tc)
103 {
104 #ifndef NDEBUG
105 memset(&tc->driver_thread, 0, sizeof(tc->driver_thread));
106 #endif
107 }
108
109 struct tc_batch_rp_info {
110 /* this is what drivers can see */
111 struct tc_renderpass_info info;
112 /* determines whether the info can be "safely" read by drivers or if it may still be in use */
113 struct util_queue_fence ready;
114 /* when a batch is full, the rp info rolls over onto 'next' */
115 struct tc_batch_rp_info *next;
116 /* when rp info has rolled over onto this struct, 'prev' is used to update pointers for realloc */
117 struct tc_batch_rp_info *prev;
118 };
119
120 static struct tc_batch_rp_info *
121 tc_batch_rp_info(struct tc_renderpass_info *info)
122 {
123 return (struct tc_batch_rp_info *)info;
124 }
125
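/* Make the recorded renderpass info conservative before a sync that isn't a
 * driver flush: drop pending invalidates, treat every non-cleared color buffer
 * as loaded, and force zsbuf load/store if it exists but hasn't been marked
 * used, so the driver can't optimize based on info that is no longer reliable.
 */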
126 static void
127 tc_sanitize_renderpass_info(struct threaded_context *tc)
128 {
129 tc->renderpass_info_recording->cbuf_invalidate = 0;
130 tc->renderpass_info_recording->zsbuf_invalidate = false;
131 tc->renderpass_info_recording->cbuf_load |= (~tc->renderpass_info_recording->cbuf_clear) & BITFIELD_MASK(PIPE_MAX_COLOR_BUFS);
132 if (tc->fb_resources[PIPE_MAX_COLOR_BUFS] && !tc_renderpass_info_is_zsbuf_used(tc->renderpass_info_recording))
133 /* this should be a "safe" way to indicate to the driver that both loads and stores are required;
134 * driver can always detect invalidation
135 */
136 tc->renderpass_info_recording->zsbuf_clear_partial = true;
137 if (tc->num_queries_active)
138 tc->renderpass_info_recording->has_query_ends = true;
139 }
140
141 /* ensure the batch's array of renderpass data is large enough for the current index */
142 static void
143 tc_batch_renderpass_infos_resize(struct threaded_context *tc, struct tc_batch *batch)
144 {
145 unsigned size = batch->renderpass_infos.capacity;
146 unsigned cur_num = MAX2(batch->renderpass_info_idx, 0);
147
148 if (size / sizeof(struct tc_batch_rp_info) > cur_num)
149 return;
150
151 struct tc_batch_rp_info *infos = batch->renderpass_infos.data;
152 unsigned old_idx = batch->renderpass_info_idx - 1;
153 bool redo = tc->renderpass_info_recording &&
154 tc->renderpass_info_recording == &infos[old_idx].info;
155 if (!util_dynarray_resize(&batch->renderpass_infos, struct tc_batch_rp_info, cur_num + 10))
156 mesa_loge("tc: memory alloc fail!");
157
158 if (size != batch->renderpass_infos.capacity) {
159 /* zero new allocation region */
160 uint8_t *data = batch->renderpass_infos.data;
161 memset(data + size, 0, batch->renderpass_infos.capacity - size);
162 unsigned start = size / sizeof(struct tc_batch_rp_info);
163 unsigned count = (batch->renderpass_infos.capacity - size) /
164 sizeof(struct tc_batch_rp_info);
165 infos = batch->renderpass_infos.data;
166 if (infos->prev)
167 infos->prev->next = infos;
168 for (unsigned i = 0; i < count; i++)
169 util_queue_fence_init(&infos[start + i].ready);
170 /* re-set current recording info on resize */
171 if (redo)
172 tc->renderpass_info_recording = &infos[old_idx].info;
173 }
174 }
175
176 /* signal that the renderpass info is "ready" for use by drivers and will no longer be updated */
177 static void
178 tc_signal_renderpass_info_ready(struct threaded_context *tc)
179 {
180 if (tc->renderpass_info_recording &&
181 !util_queue_fence_is_signalled(&tc_batch_rp_info(tc->renderpass_info_recording)->ready))
182 util_queue_fence_signal(&tc_batch_rp_info(tc->renderpass_info_recording)->ready);
183 }
184
185 /* increment the current renderpass info struct for recording
186 * 'full_copy' is used for preserving data across non-blocking tc batch flushes
187 */
188 static void
189 tc_batch_increment_renderpass_info(struct threaded_context *tc, unsigned batch_idx, bool full_copy)
190 {
191 struct tc_batch *batch = &tc->batch_slots[batch_idx];
192 struct tc_batch_rp_info *tc_info = batch->renderpass_infos.data;
193
194 if (tc_info[0].next || batch->num_total_slots) {
195 /* deadlock condition detected: all batches are in flight, renderpass hasn't ended
196 * (probably a cts case)
197 */
198 struct tc_batch_rp_info *info = tc_batch_rp_info(tc->renderpass_info_recording);
199 if (!util_queue_fence_is_signalled(&info->ready)) {
200 /* this batch is actively executing and the driver is waiting on the recording fence to signal */
201 /* force all buffer usage to avoid data loss */
202 info->info.cbuf_load = ~(BITFIELD_MASK(8) & info->info.cbuf_clear);
203 info->info.zsbuf_clear_partial = true;
204 info->info.has_query_ends = tc->num_queries_active > 0;
205 /* ensure threaded_context_get_renderpass_info() won't deadlock */
206 info->next = NULL;
207 util_queue_fence_signal(&info->ready);
208 }
209 /* always wait on the batch to finish since this will otherwise overwrite thread data */
210 util_queue_fence_wait(&batch->fence);
211 }
212 /* increment rp info and initialize it */
213 batch->renderpass_info_idx++;
214 tc_batch_renderpass_infos_resize(tc, batch);
215 tc_info = batch->renderpass_infos.data;
216
217 if (full_copy) {
218 /* this should only be called when changing batches */
219 assert(batch->renderpass_info_idx == 0);
220 /* copy the previous data in its entirety: this is still the same renderpass */
221 if (tc->renderpass_info_recording) {
222 tc_info[batch->renderpass_info_idx].info.data = tc->renderpass_info_recording->data;
223 tc_batch_rp_info(tc->renderpass_info_recording)->next = &tc_info[batch->renderpass_info_idx];
224 tc_info[batch->renderpass_info_idx].prev = tc_batch_rp_info(tc->renderpass_info_recording);
225 /* guard against deadlock scenario */
226 assert(&tc_batch_rp_info(tc->renderpass_info_recording)->next->info != tc->renderpass_info_recording);
227 } else {
228 tc_info[batch->renderpass_info_idx].info.data = 0;
229 tc_info[batch->renderpass_info_idx].prev = NULL;
230 }
231 } else {
232 /* selectively copy: only the CSO metadata is copied, and a new framebuffer state will be added later */
233 tc_info[batch->renderpass_info_idx].info.data = 0;
234 if (tc->renderpass_info_recording) {
235 tc_info[batch->renderpass_info_idx].info.data16[2] = tc->renderpass_info_recording->data16[2];
236 tc_batch_rp_info(tc->renderpass_info_recording)->next = NULL;
237 tc_info[batch->renderpass_info_idx].prev = NULL;
238 }
239 }
240
241 assert(!full_copy || !tc->renderpass_info_recording || tc_batch_rp_info(tc->renderpass_info_recording)->next);
242 /* signal existing info since it will not be used anymore */
243 tc_signal_renderpass_info_ready(tc);
244 util_queue_fence_reset(&tc_info[batch->renderpass_info_idx].ready);
245 /* guard against deadlock scenario */
246 assert(tc->renderpass_info_recording != &tc_info[batch->renderpass_info_idx].info);
247 /* this is now the current recording renderpass info */
248 tc->renderpass_info_recording = &tc_info[batch->renderpass_info_idx].info;
249 batch->max_renderpass_info_idx = batch->renderpass_info_idx;
250 }
251
252 static ALWAYS_INLINE struct tc_renderpass_info *
253 tc_get_renderpass_info(struct threaded_context *tc)
254 {
255 return tc->renderpass_info_recording;
256 }
257
258 /* update metadata at draw time */
259 static void
260 tc_parse_draw(struct threaded_context *tc)
261 {
262 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
263
264 if (info) {
265 /* all buffers that aren't cleared are considered loaded */
266 info->cbuf_load |= ~info->cbuf_clear;
267 if (!info->zsbuf_clear)
268 info->zsbuf_load = true;
269 /* previous invalidates are no longer relevant */
270 info->cbuf_invalidate = 0;
271 info->zsbuf_invalidate = false;
272 info->has_draw = true;
273 info->has_query_ends |= tc->query_ended;
274 }
275
276 tc->in_renderpass = true;
277 tc->seen_fb_state = true;
278 tc->query_ended = false;
279 }
280
281 static void *
282 to_call_check(void *ptr, unsigned num_slots)
283 {
284 #if TC_DEBUG >= 1
285 struct tc_call_base *call = ptr;
286 tc_assert(call->num_slots == num_slots);
287 #endif
288 return ptr;
289 }
290 #define to_call(ptr, type) ((struct type *)to_call_check((void *)(ptr), call_size(type)))
291
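/* Calls are packed into the batch as a stream of 8-byte slots; these helpers
 * convert a call struct size (optionally with a trailing variable-length
 * 'slot' array) into its slot count, e.g. a 24-byte payload occupies
 * size_to_slots(24) == 3 slots.
 */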
292 #define size_to_slots(size) DIV_ROUND_UP(size, 8)
293 #define call_size(type) size_to_slots(sizeof(struct type))
294 #define call_size_with_slots(type, num_slots) size_to_slots( \
295 sizeof(struct type) + sizeof(((struct type*)NULL)->slot[0]) * (num_slots))
296 #define get_next_call(ptr, type) ((struct type*)((uint64_t*)ptr + call_size(type)))
297
298 ALWAYS_INLINE static void
299 tc_set_resource_batch_usage(struct threaded_context *tc, struct pipe_resource *pres)
300 {
301 /* ignore batch usage when persistent */
302 if (threaded_resource(pres)->last_batch_usage != INT8_MAX)
303 threaded_resource(pres)->last_batch_usage = tc->next;
304 threaded_resource(pres)->batch_generation = tc->batch_generation;
305 }
306
307 ALWAYS_INLINE static void
308 tc_set_resource_batch_usage_persistent(struct threaded_context *tc, struct pipe_resource *pres, bool enable)
309 {
310 if (!pres)
311 return;
312 /* mark with special value to block any unsynchronized access */
313 threaded_resource(pres)->last_batch_usage = enable ? INT8_MAX : tc->next;
314 threaded_resource(pres)->batch_generation = tc->batch_generation;
315 }
316
317 /* this can ONLY be used to check against the currently recording batch */
318 ALWAYS_INLINE static bool
319 tc_resource_batch_usage_test_busy(const struct threaded_context *tc, const struct pipe_resource *pres)
320 {
321 const struct threaded_resource *tbuf = (const struct threaded_resource*)pres;
322
323 if (!tc->options.unsynchronized_texture_subdata)
324 return true;
325
326 /* resource has persistent access: assume always busy */
327 if (tbuf->last_batch_usage == INT8_MAX)
328 return true;
329
330 /* resource has never been seen */
331 if (tbuf->last_batch_usage == -1)
332 return false;
333
334 /* resource has been seen but no batches have executed */
335 if (tc->last_completed == -1)
336 return true;
337
338 /* begin comparisons checking number of times batches have cycled */
339 unsigned diff = tc->batch_generation - tbuf->batch_generation;
340 /* resource has been seen, batches have fully cycled at least once */
341 if (diff > 1)
342 return false;
343
344 /* resource has been seen in current batch cycle: return whether batch has definitely completed */
345 if (diff == 0)
346 return tc->last_completed >= tbuf->last_batch_usage;
347
348 /* resource has been seen within one batch cycle: check for batch wrapping */
349 if (tc->last_completed >= tbuf->last_batch_usage)
350 /* this or a subsequent pre-wrap batch was the last to definitely complete: resource is idle */
351 return false;
352
353 /* batch execution has not definitely wrapped: resource is definitely not idle */
354 if (tc->last_completed > tc->next)
355 return true;
356
357 /* resource was seen pre-wrap, batch execution has definitely wrapped: idle */
358 if (tbuf->last_batch_usage > tc->last_completed)
359 return false;
360
361 /* tc->last_completed is not an exact measurement, so anything else is considered busy */
362 return true;
363 }
364
365 /* Assign src to dst while dst is uninitialized. */
366 static inline void
367 tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
368 {
369 *dst = src;
370 pipe_reference(NULL, &src->reference); /* only increment refcount */
371 }
372
373 /* Assign src to dst while dst is uninitialized. */
374 static inline void
375 tc_set_vertex_state_reference(struct pipe_vertex_state **dst,
376 struct pipe_vertex_state *src)
377 {
378 *dst = src;
379 pipe_reference(NULL, &src->reference); /* only increment refcount */
380 }
381
382 /* Unreference dst but don't touch the dst pointer. */
383 static inline void
384 tc_drop_resource_reference(struct pipe_resource *dst)
385 {
386 if (pipe_reference(&dst->reference, NULL)) /* only decrement refcount */
387 pipe_resource_destroy(dst);
388 }
389
390 /* Unreference dst but don't touch the dst pointer. */
391 static inline void
392 tc_drop_surface_reference(struct pipe_surface *dst)
393 {
394 if (pipe_reference(&dst->reference, NULL)) /* only decrement refcount */
395 dst->context->surface_destroy(dst->context, dst);
396 }
397
398 /* Unreference dst but don't touch the dst pointer. */
399 static inline void
400 tc_drop_so_target_reference(struct pipe_stream_output_target *dst)
401 {
402 if (pipe_reference(&dst->reference, NULL)) /* only decrement refcount */
403 dst->context->stream_output_target_destroy(dst->context, dst);
404 }
405
406 /**
407 * Subtract the given number of references.
408 */
409 static inline void
410 tc_drop_vertex_state_references(struct pipe_vertex_state *dst, int num_refs)
411 {
412 int count = p_atomic_add_return(&dst->reference.count, -num_refs);
413
414 assert(count >= 0);
415 /* Underflows shouldn't happen, but let's be safe. */
416 if (count <= 0)
417 dst->screen->vertex_state_destroy(dst->screen, dst);
418 }
419
420 /* We don't want to read or write min_index and max_index, because
421 * they shouldn't be needed by drivers at this point.
422 */
423 #define DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX \
424 offsetof(struct pipe_draw_info, min_index)
425
426 ALWAYS_INLINE static struct tc_renderpass_info *
427 incr_rp_info(struct tc_renderpass_info *tc_info)
428 {
429 struct tc_batch_rp_info *info = tc_batch_rp_info(tc_info);
430 return &info[1].info;
431 }
432
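/* Replay every call recorded in a batch on the given context. When renderpass
 * parsing is enabled, advance tc->renderpass_info past flush and
 * set_framebuffer_state boundaries so each draw executes against the info
 * that was recorded for it.
 */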
433 ALWAYS_INLINE static void
434 batch_execute(struct tc_batch *batch, struct pipe_context *pipe, uint64_t *last, bool parsing)
435 {
436 /* if the framebuffer state is persisting from a previous batch,
437 * begin incrementing renderpass info on the first set_framebuffer_state call
438 */
439 bool first = !batch->first_set_fb;
440 const tc_execute *execute_func = batch->tc->execute_func;
441
442 for (uint64_t *iter = batch->slots; iter != last;) {
443 struct tc_call_base *call = (struct tc_call_base *)iter;
444
445 tc_assert(call->sentinel == TC_SENTINEL);
446
447 #if TC_DEBUG >= 3
448 tc_printf("CALL: %s", tc_call_names[call->call_id]);
449 #endif
450
451 TC_TRACE_SCOPE(call->call_id);
452
453 iter += execute_func[call->call_id](pipe, call);
454
455 if (parsing) {
456 if (call->call_id == TC_CALL_flush) {
457 /* always increment renderpass info for non-deferred flushes */
458 batch->tc->renderpass_info = incr_rp_info(batch->tc->renderpass_info);
459 /* if a flush happens, renderpass info is always incremented after */
460 first = false;
461 } else if (call->call_id == TC_CALL_set_framebuffer_state) {
462 /* the renderpass info pointer is already set at the start of the batch,
463 * so don't increment on the first set_framebuffer_state call
464 */
465 if (!first)
466 batch->tc->renderpass_info = incr_rp_info(batch->tc->renderpass_info);
467 first = false;
468 } else if (call->call_id >= TC_CALL_draw_single &&
469 call->call_id <= TC_CALL_draw_vstate_multi) {
470 /* if a draw happens before a set_framebuffer_state on this batch,
471 * begin incrementing renderpass data
472 */
473 first = false;
474 }
475 }
476 }
477 }
478
479 static void
480 tc_batch_execute(void *job, UNUSED void *gdata, int thread_index)
481 {
482 struct tc_batch *batch = job;
483 struct pipe_context *pipe = batch->tc->pipe;
484 uint64_t *last = &batch->slots[batch->num_total_slots];
485
486 tc_batch_check(batch);
487 tc_set_driver_thread(batch->tc);
488
489 assert(!batch->token);
490
491 /* setup renderpass info */
492 batch->tc->renderpass_info = batch->renderpass_infos.data;
493
494 if (batch->tc->options.parse_renderpass_info) {
495 batch_execute(batch, pipe, last, true);
496
497 struct tc_batch_rp_info *info = batch->renderpass_infos.data;
498 for (unsigned i = 0; i < batch->max_renderpass_info_idx + 1; i++) {
499 if (info[i].next)
500 info[i].next->prev = NULL;
501 info[i].next = NULL;
502 }
503 } else {
504 batch_execute(batch, pipe, last, false);
505 }
506
507 /* Add the fence to the list of fences for the driver to signal at the next
508 * flush, which we use for tracking which buffers are referenced by
509 * an unflushed command buffer.
510 */
511 struct threaded_context *tc = batch->tc;
512 struct util_queue_fence *fence =
513 &tc->buffer_lists[batch->buffer_list_index].driver_flushed_fence;
514
515 if (tc->options.driver_calls_flush_notify) {
516 tc->signal_fences_next_flush[tc->num_signal_fences_next_flush++] = fence;
517
518 /* Since our buffer lists are chained as a ring, we need to flush
519 * the context twice as we go around the ring to make the driver signal
520 * the buffer list fences, so that the producer thread can reuse the buffer
521 * list structures for the next batches without waiting.
522 */
523 unsigned half_ring = TC_MAX_BUFFER_LISTS / 2;
524 if (batch->buffer_list_index % half_ring == half_ring - 1)
525 pipe->flush(pipe, NULL, PIPE_FLUSH_ASYNC);
526 } else {
527 util_queue_fence_signal(fence);
528 }
529
530 tc_clear_driver_thread(batch->tc);
531 tc_batch_check(batch);
532 batch->num_total_slots = 0;
533 batch->last_mergeable_call = NULL;
534 batch->first_set_fb = false;
535 batch->max_renderpass_info_idx = 0;
536 batch->tc->last_completed = batch->batch_idx;
537 }
538
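/* Advance to the next buffer list in the ring for the batch about to be
 * recorded: reset its driver-flushed fence, clear its bitset, and request that
 * all currently bound gfx/compute buffers be re-added on first use.
 */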
539 static void
540 tc_begin_next_buffer_list(struct threaded_context *tc)
541 {
542 tc->next_buf_list = (tc->next_buf_list + 1) % TC_MAX_BUFFER_LISTS;
543
544 tc->batch_slots[tc->next].buffer_list_index = tc->next_buf_list;
545
546 /* Clear the buffer list in the new empty batch. */
547 struct tc_buffer_list *buf_list = &tc->buffer_lists[tc->next_buf_list];
548 assert(util_queue_fence_is_signalled(&buf_list->driver_flushed_fence));
549 util_queue_fence_reset(&buf_list->driver_flushed_fence); /* set to unsignalled */
550 BITSET_ZERO(buf_list->buffer_list);
551
552 tc->add_all_gfx_bindings_to_buffer_list = true;
553 tc->add_all_compute_bindings_to_buffer_list = true;
554 }
555
556 static void
557 tc_add_call_end(struct tc_batch *next)
558 {
559 /* Add a dummy last call that won't be executed, but will indicate the end
560 * of the batch. It's for calls that always look at the next call and this
561 * stops them from looking farther ahead.
562 */
563 assert(next->num_total_slots < TC_SLOTS_PER_BATCH);
564 struct tc_call_base *call =
565 (struct tc_call_base*)&next->slots[next->num_total_slots];
566 call->call_id = TC_NUM_CALLS;
567 call->num_slots = 1;
568 }
569
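/* Submit the current batch to the worker queue and switch recording to the
 * next batch slot. 'full_copy' carries the in-progress renderpass info across
 * the batch boundary.
 */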
570 static void
571 tc_batch_flush(struct threaded_context *tc, bool full_copy)
572 {
573 struct tc_batch *next = &tc->batch_slots[tc->next];
574 unsigned next_id = (tc->next + 1) % TC_MAX_BATCHES;
575
576 tc_assert(next->num_total_slots != 0);
577 tc_add_call_end(next);
578
579 tc_batch_check(next);
580 tc_debug_check(tc);
581 tc->bytes_mapped_estimate = 0;
582 tc->bytes_replaced_estimate = 0;
583 p_atomic_add(&tc->num_offloaded_slots, next->num_total_slots);
584
585 if (next->token) {
586 next->token->tc = NULL;
587 tc_unflushed_batch_token_reference(&next->token, NULL);
588 }
589 /* reset renderpass info index for subsequent use */
590 next->renderpass_info_idx = -1;
591
592 /* always increment renderpass info on batch flush;
593 * renderpass info can only be accessed by its owner batch during execution
594 */
595 if (tc->renderpass_info_recording) {
596 tc->batch_slots[next_id].first_set_fb = full_copy;
597 tc_batch_increment_renderpass_info(tc, next_id, full_copy);
598 }
599
600 util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
601 NULL, 0);
602 tc->last = tc->next;
603 tc->next = next_id;
604 if (next_id == 0)
605 tc->batch_generation++;
606 tc_begin_next_buffer_list(tc);
607
608 }
609
610 /* This is the function that adds variable-sized calls into the current
611 * batch. It also flushes the batch if there is not enough space there.
612 * All other higher-level "add" functions use it.
613 */
614 static void *
615 tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id,
616 unsigned num_slots)
617 {
618 TC_TRACE_SCOPE(id);
619 struct tc_batch *next = &tc->batch_slots[tc->next];
620 assert(num_slots <= TC_SLOTS_PER_BATCH - 1);
621 tc_debug_check(tc);
622
623 if (unlikely(next->num_total_slots + num_slots > TC_SLOTS_PER_BATCH - 1)) {
624 /* copy existing renderpass info during flush */
625 tc_batch_flush(tc, true);
626 next = &tc->batch_slots[tc->next];
627 tc_assert(next->num_total_slots == 0);
628 tc_assert(next->last_mergeable_call == NULL);
629 }
630
631 tc_assert(util_queue_fence_is_signalled(&next->fence));
632
633 struct tc_call_base *call = (struct tc_call_base*)&next->slots[next->num_total_slots];
634 next->num_total_slots += num_slots;
635
636 #if !defined(NDEBUG) && TC_DEBUG >= 1
637 call->sentinel = TC_SENTINEL;
638 #endif
639 call->call_id = id;
640 call->num_slots = num_slots;
641
642 #if TC_DEBUG >= 3
643 tc_printf("ENQUEUE: %s", tc_call_names[id]);
644 #endif
645
646 tc_debug_check(tc);
647 return call;
648 }
649
650 #define tc_add_call(tc, execute, type) \
651 ((struct type*)tc_add_sized_call(tc, execute, call_size(type)))
652
653 #define tc_add_slot_based_call(tc, execute, type, num_slots) \
654 ((struct type*)tc_add_sized_call(tc, execute, \
655 call_size_with_slots(type, num_slots)))
656
657 /* Returns the last mergeable call that was added to the unflushed
658 * batch, or NULL if the address of that call is not currently known
659 * or no such call exists in the unflushed batch.
660 */
661 static struct tc_call_base *
662 tc_get_last_mergeable_call(struct threaded_context *tc)
663 {
664 struct tc_batch *batch = &tc->batch_slots[tc->next];
665 struct tc_call_base *call = batch->last_mergeable_call;
666
667 tc_assert(call == NULL || call->num_slots <= batch->num_total_slots);
668
669 if (call && (uint64_t *)call == &batch->slots[batch->num_total_slots - call->num_slots])
670 return call;
671 else
672 return NULL;
673 }
674
675 /* Increases the size of the last call in the unflushed batch to the
676 * given number of slots, if possible, without changing the call's data.
677 */
678 static bool
679 tc_enlarge_last_mergeable_call(struct threaded_context *tc, unsigned desired_num_slots)
680 {
681 struct tc_batch *batch = &tc->batch_slots[tc->next];
682 struct tc_call_base *call = tc_get_last_mergeable_call(tc);
683
684 tc_assert(call);
685 tc_assert(desired_num_slots >= call->num_slots);
686
687 unsigned added_slots = desired_num_slots - call->num_slots;
688
689 if (unlikely(batch->num_total_slots + added_slots > TC_SLOTS_PER_BATCH - 1))
690 return false;
691
692 batch->num_total_slots += added_slots;
693 call->num_slots += added_slots;
694
695 return true;
696 }
697
698 static void
699 tc_mark_call_mergeable(struct threaded_context *tc, struct tc_call_base *call)
700 {
701 struct tc_batch *batch = &tc->batch_slots[tc->next];
702 tc_assert(call->num_slots <= batch->num_total_slots);
703 tc_assert((uint64_t *)call == &batch->slots[batch->num_total_slots - call->num_slots]);
704 batch->last_mergeable_call = call;
705 }
706
707 static bool
708 tc_is_sync(struct threaded_context *tc)
709 {
710 struct tc_batch *last = &tc->batch_slots[tc->last];
711 struct tc_batch *next = &tc->batch_slots[tc->next];
712
713 return util_queue_fence_is_signalled(&last->fence) &&
714 !next->num_total_slots;
715 }
716
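/* Fully synchronize with the driver thread: wait for the last queued batch and
 * execute any still-unflushed calls directly on the calling thread.
 */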
717 static void
718 _tc_sync(struct threaded_context *tc, UNUSED const char *info, UNUSED const char *func)
719 {
720 struct tc_batch *last = &tc->batch_slots[tc->last];
721 struct tc_batch *next = &tc->batch_slots[tc->next];
722 bool synced = false;
723
724 MESA_TRACE_SCOPE(func);
725
726 tc_debug_check(tc);
727
728 if (tc->options.parse_renderpass_info && tc->in_renderpass && !tc->flushing) {
729 /* corner case: if tc syncs for any reason but a driver flush during a renderpass,
730 * then the current renderpass info MUST be signaled to avoid deadlocking the driver
731 *
732 * this is not a "complete" signal operation, however, as it's unknown what calls may
733 * come after this one, which means that framebuffer attachment data is unreliable
734 *
735 * to avoid erroneously passing bad state to the driver (e.g., allowing zsbuf elimination),
736 * force all attachments active and assume the app was going to get bad perf here anyway
737 */
738 tc_sanitize_renderpass_info(tc);
739 }
740 tc_signal_renderpass_info_ready(tc);
741
742 /* Only wait for queued calls... */
743 if (!util_queue_fence_is_signalled(&last->fence)) {
744 util_queue_fence_wait(&last->fence);
745 synced = true;
746 }
747
748 tc_debug_check(tc);
749
750 if (next->token) {
751 next->token->tc = NULL;
752 tc_unflushed_batch_token_reference(&next->token, NULL);
753 }
754
755 /* .. and execute unflushed calls directly. */
756 if (next->num_total_slots) {
757 p_atomic_add(&tc->num_direct_slots, next->num_total_slots);
758 tc->bytes_mapped_estimate = 0;
759 tc->bytes_replaced_estimate = 0;
760 tc_add_call_end(next);
761 tc_batch_execute(next, NULL, 0);
762 tc_begin_next_buffer_list(tc);
763 synced = true;
764 }
765
766 if (synced) {
767 p_atomic_inc(&tc->num_syncs);
768
769 if (tc_strcmp(func, "tc_destroy") != 0) {
770 tc_printf("sync %s %s", func, info);
771 }
772 }
773
774 tc_debug_check(tc);
775
776 if (tc->options.parse_renderpass_info) {
777 int renderpass_info_idx = next->renderpass_info_idx;
778 if (renderpass_info_idx > 0) {
779 /* don't reset if fb state is unflushed */
780 bool fb_no_draw = tc->seen_fb_state && !tc->renderpass_info_recording->has_draw;
781 uint32_t fb_info = tc->renderpass_info_recording->data32[0];
782 next->renderpass_info_idx = -1;
783 tc_batch_increment_renderpass_info(tc, tc->next, false);
784 if (fb_no_draw)
785 tc->renderpass_info_recording->data32[0] = fb_info;
786 } else if (tc->renderpass_info_recording->has_draw) {
787 tc->renderpass_info_recording->data32[0] = 0;
788 }
789 tc->seen_fb_state = false;
790 tc->query_ended = false;
791 }
792 }
793
794 #define tc_sync(tc) _tc_sync(tc, "", __func__)
795 #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__)
796
797 /**
798 * Call this from fence_finish for same-context fence waits of deferred fences
799 * that haven't been flushed yet.
800 *
801 * The passed pipe_context must be the one passed to pipe_screen::fence_finish,
802 * i.e., the wrapped one.
803 */
804 void
805 threaded_context_flush(struct pipe_context *_pipe,
806 struct tc_unflushed_batch_token *token,
807 bool prefer_async)
808 {
809 struct threaded_context *tc = threaded_context(_pipe);
810
811 /* This is called from the gallium frontend / application thread. */
812 if (token->tc && token->tc == tc) {
813 struct tc_batch *last = &tc->batch_slots[tc->last];
814
815 /* Prefer to do the flush in the driver thread if it is already
816 * running. That should be better for cache locality.
817 */
818 if (prefer_async || !util_queue_fence_is_signalled(&last->fence))
819 tc_batch_flush(tc, false);
820 else
821 tc_sync(token->tc);
822 }
823 }
824
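/* Record a buffer in the given buffer list. The unique buffer id is hashed
 * into a fixed-size bitset, so lookups may yield false positives (treated as
 * "possibly busy") but never false negatives.
 */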
825 static void
826 tc_add_to_buffer_list(struct tc_buffer_list *next, struct pipe_resource *buf)
827 {
828 uint32_t id = threaded_resource(buf)->buffer_id_unique;
829 BITSET_SET(next->buffer_list, id & TC_BUFFER_ID_MASK);
830 }
831
832 /* Reset a range of buffer binding slots. */
833 static void
834 tc_unbind_buffers(uint32_t *binding, unsigned count)
835 {
836 if (count)
837 memset(binding, 0, sizeof(*binding) * count);
838 }
839
840 static void
841 tc_add_bindings_to_buffer_list(BITSET_WORD *buffer_list, const uint32_t *bindings,
842 unsigned count)
843 {
844 for (unsigned i = 0; i < count; i++) {
845 if (bindings[i])
846 BITSET_SET(buffer_list, bindings[i] & TC_BUFFER_ID_MASK);
847 }
848 }
849
850 static bool
851 tc_rebind_bindings(uint32_t old_id, uint32_t new_id, uint32_t *bindings,
852 unsigned count)
853 {
854 unsigned rebind_count = 0;
855
856 for (unsigned i = 0; i < count; i++) {
857 if (bindings[i] == old_id) {
858 bindings[i] = new_id;
859 rebind_count++;
860 }
861 }
862 return rebind_count;
863 }
864
865 static void
866 tc_add_shader_bindings_to_buffer_list(struct threaded_context *tc,
867 BITSET_WORD *buffer_list,
868 enum pipe_shader_type shader)
869 {
870 tc_add_bindings_to_buffer_list(buffer_list, tc->const_buffers[shader],
871 tc->max_const_buffers);
872 if (tc->seen_shader_buffers[shader]) {
873 tc_add_bindings_to_buffer_list(buffer_list, tc->shader_buffers[shader],
874 tc->max_shader_buffers);
875 }
876 if (tc->seen_image_buffers[shader]) {
877 tc_add_bindings_to_buffer_list(buffer_list, tc->image_buffers[shader],
878 tc->max_images);
879 }
880 if (tc->seen_sampler_buffers[shader]) {
881 tc_add_bindings_to_buffer_list(buffer_list, tc->sampler_buffers[shader],
882 tc->max_samplers);
883 }
884 }
885
886 static unsigned
887 tc_rebind_shader_bindings(struct threaded_context *tc, uint32_t old_id,
888 uint32_t new_id, enum pipe_shader_type shader, uint32_t *rebind_mask)
889 {
890 unsigned ubo = 0, ssbo = 0, img = 0, sampler = 0;
891
892 ubo = tc_rebind_bindings(old_id, new_id, tc->const_buffers[shader],
893 tc->max_const_buffers);
894 if (ubo)
895 *rebind_mask |= BITFIELD_BIT(TC_BINDING_UBO_VS) << shader;
896 if (tc->seen_shader_buffers[shader]) {
897 ssbo = tc_rebind_bindings(old_id, new_id, tc->shader_buffers[shader],
898 tc->max_shader_buffers);
899 if (ssbo)
900 *rebind_mask |= BITFIELD_BIT(TC_BINDING_SSBO_VS) << shader;
901 }
902 if (tc->seen_image_buffers[shader]) {
903 img = tc_rebind_bindings(old_id, new_id, tc->image_buffers[shader],
904 tc->max_images);
905 if (img)
906 *rebind_mask |= BITFIELD_BIT(TC_BINDING_IMAGE_VS) << shader;
907 }
908 if (tc->seen_sampler_buffers[shader]) {
909 sampler = tc_rebind_bindings(old_id, new_id, tc->sampler_buffers[shader],
910 tc->max_samplers);
911 if (sampler)
912 *rebind_mask |= BITFIELD_BIT(TC_BINDING_SAMPLERVIEW_VS) << shader;
913 }
914 return ubo + ssbo + img + sampler;
915 }
916
917 /* Add all bound buffers used by VS/TCS/TES/GS/FS to the buffer list.
918 * This is called by the first draw call in a batch when we want to inherit
919 * all bindings set by the previous batch.
920 */
921 static void
922 tc_add_all_gfx_bindings_to_buffer_list(struct threaded_context *tc)
923 {
924 BITSET_WORD *buffer_list = tc->buffer_lists[tc->next_buf_list].buffer_list;
925
926 tc_add_bindings_to_buffer_list(buffer_list, tc->vertex_buffers, tc->num_vertex_buffers);
927 if (tc->seen_streamout_buffers)
928 tc_add_bindings_to_buffer_list(buffer_list, tc->streamout_buffers, PIPE_MAX_SO_BUFFERS);
929
930 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_VERTEX);
931 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_FRAGMENT);
932
933 if (tc->seen_tcs)
934 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_TESS_CTRL);
935 if (tc->seen_tes)
936 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_TESS_EVAL);
937 if (tc->seen_gs)
938 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_GEOMETRY);
939
940 tc->add_all_gfx_bindings_to_buffer_list = false;
941 }
942
943 /* Add all bound buffers used by compute to the buffer list.
944 * This is called by the first compute call in a batch when we want to inherit
945 * all bindings set by the previous batch.
946 */
947 static void
948 tc_add_all_compute_bindings_to_buffer_list(struct threaded_context *tc)
949 {
950 BITSET_WORD *buffer_list = tc->buffer_lists[tc->next_buf_list].buffer_list;
951
952 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_COMPUTE);
953 tc->add_all_compute_bindings_to_buffer_list = false;
954 }
955
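/* After buffer invalidation assigns new storage (and a new unique id), rewrite
 * every binding slot that still holds old_id to new_id. Returns the number of
 * rebound slots and flags the affected binding types in *rebind_mask.
 */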
956 static unsigned
957 tc_rebind_buffer(struct threaded_context *tc, uint32_t old_id, uint32_t new_id, uint32_t *rebind_mask)
958 {
959 unsigned vbo = 0, so = 0;
960
961 vbo = tc_rebind_bindings(old_id, new_id, tc->vertex_buffers,
962 tc->num_vertex_buffers);
963 if (vbo)
964 *rebind_mask |= BITFIELD_BIT(TC_BINDING_VERTEX_BUFFER);
965
966 if (tc->seen_streamout_buffers) {
967 so = tc_rebind_bindings(old_id, new_id, tc->streamout_buffers,
968 PIPE_MAX_SO_BUFFERS);
969 if (so)
970 *rebind_mask |= BITFIELD_BIT(TC_BINDING_STREAMOUT_BUFFER);
971 }
972 unsigned rebound = vbo + so;
973
974 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_VERTEX, rebind_mask);
975 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_FRAGMENT, rebind_mask);
976
977 if (tc->seen_tcs)
978 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_TESS_CTRL, rebind_mask);
979 if (tc->seen_tes)
980 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_TESS_EVAL, rebind_mask);
981 if (tc->seen_gs)
982 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_GEOMETRY, rebind_mask);
983
984 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_COMPUTE, rebind_mask);
985
986 if (rebound)
987 BITSET_SET(tc->buffer_lists[tc->next_buf_list].buffer_list, new_id & TC_BUFFER_ID_MASK);
988 return rebound;
989 }
990
991 static bool
992 tc_is_buffer_bound_with_mask(uint32_t id, uint32_t *bindings, unsigned binding_mask)
993 {
994 while (binding_mask) {
995 if (bindings[u_bit_scan(&binding_mask)] == id)
996 return true;
997 }
998 return false;
999 }
1000
1001 static bool
1002 tc_is_buffer_shader_bound_for_write(struct threaded_context *tc, uint32_t id,
1003 enum pipe_shader_type shader)
1004 {
1005 if (tc->seen_shader_buffers[shader] &&
1006 tc_is_buffer_bound_with_mask(id, tc->shader_buffers[shader],
1007 tc->shader_buffers_writeable_mask[shader]))
1008 return true;
1009
1010 if (tc->seen_image_buffers[shader] &&
1011 tc_is_buffer_bound_with_mask(id, tc->image_buffers[shader],
1012 tc->image_buffers_writeable_mask[shader]))
1013 return true;
1014
1015 return false;
1016 }
1017
1018 static bool
1019 tc_is_buffer_bound_for_write(struct threaded_context *tc, uint32_t id)
1020 {
1021 if (tc->seen_streamout_buffers &&
1022 tc_is_buffer_bound_with_mask(id, tc->streamout_buffers,
1023 BITFIELD_MASK(PIPE_MAX_SO_BUFFERS)))
1024 return true;
1025
1026 if (tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_VERTEX) ||
1027 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_FRAGMENT) ||
1028 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_COMPUTE))
1029 return true;
1030
1031 if (tc->seen_tcs &&
1032 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_TESS_CTRL))
1033 return true;
1034
1035 if (tc->seen_tes &&
1036 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_TESS_EVAL))
1037 return true;
1038
1039 if (tc->seen_gs &&
1040 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_GEOMETRY))
1041 return true;
1042
1043 return false;
1044 }
1045
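/* Conservative busy check: without an is_resource_busy callback everything is
 * reported busy; otherwise a buffer referenced by any unflushed batch is
 * considered busy, and the driver is only consulted once no unflushed batch
 * references it.
 */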
1046 static bool
1047 tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf,
1048 unsigned map_usage)
1049 {
1050 if (!tc->options.is_resource_busy)
1051 return true;
1052
1053 uint32_t id_hash = tbuf->buffer_id_unique & TC_BUFFER_ID_MASK;
1054
1055 for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++) {
1056 struct tc_buffer_list *buf_list = &tc->buffer_lists[i];
1057
1058 /* If the buffer is referenced by a batch that hasn't been flushed (by tc or the driver),
1059 * then the buffer is considered busy. */
1060 if (!util_queue_fence_is_signalled(&buf_list->driver_flushed_fence) &&
1061 BITSET_TEST(buf_list->buffer_list, id_hash))
1062 return true;
1063 }
1064
1065 /* The buffer isn't referenced by any unflushed batch: we can safely ask the driver whether
1066 * this buffer is busy or not. */
1067 return tc->options.is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage);
1068 }
1069
1070 /**
1071 * allow_cpu_storage should be false for user memory and imported buffers.
1072 */
1073 void
1074 threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage)
1075 {
1076 struct threaded_resource *tres = threaded_resource(res);
1077
1078 tres->latest = &tres->b;
1079 tres->cpu_storage = NULL;
1080 util_range_init(&tres->valid_buffer_range);
1081 tres->is_shared = false;
1082 tres->is_user_ptr = false;
1083 tres->buffer_id_unique = 0;
1084 tres->pending_staging_uploads = 0;
1085 tres->last_batch_usage = -1;
1086 util_range_init(&tres->pending_staging_uploads_range);
1087
1088 if (allow_cpu_storage &&
1089 !(res->flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
1090 PIPE_RESOURCE_FLAG_SPARSE |
1091 PIPE_RESOURCE_FLAG_ENCRYPTED)) &&
1092 /* We need buffer invalidation and buffer busyness tracking for the CPU
1093 * storage, which aren't supported with pipe_vertex_state. */
1094 !(res->bind & PIPE_BIND_VERTEX_STATE))
1095 tres->allow_cpu_storage = true;
1096 else
1097 tres->allow_cpu_storage = false;
1098 }
1099
1100 void
1101 threaded_resource_deinit(struct pipe_resource *res)
1102 {
1103 struct threaded_resource *tres = threaded_resource(res);
1104
1105 if (tres->latest != &tres->b)
1106 pipe_resource_reference(&tres->latest, NULL);
1107 util_range_destroy(&tres->valid_buffer_range);
1108 util_range_destroy(&tres->pending_staging_uploads_range);
1109 align_free(tres->cpu_storage);
1110 }
1111
1112 struct pipe_context *
1113 threaded_context_unwrap_sync(struct pipe_context *pipe)
1114 {
1115 if (!pipe || !pipe->priv)
1116 return pipe;
1117
1118 tc_sync(threaded_context(pipe));
1119 return (struct pipe_context*)pipe->priv;
1120 }
1121
1122
1123 /********************************************************************
1124 * simple functions
1125 */
1126
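/* TC_FUNC1 stamps out the pieces needed for simple one-argument state setters:
 * a payload struct (tc_call_<func>), the execute callback replayed on the
 * driver thread, and the tc_<func> entry point that records the call; e.g.
 * TC_FUNC1(set_sample_mask, , unsigned, , ) below queues pipe->set_sample_mask().
 */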
1127 #define TC_FUNC1(func, qualifier, type, deref, addr, ...) \
1128 struct tc_call_##func { \
1129 struct tc_call_base base; \
1130 type state; \
1131 }; \
1132 \
1133 static uint16_t \
1134 tc_call_##func(struct pipe_context *pipe, void *call) \
1135 { \
1136 pipe->func(pipe, addr(to_call(call, tc_call_##func)->state)); \
1137 return call_size(tc_call_##func); \
1138 } \
1139 \
1140 static void \
1141 tc_##func(struct pipe_context *_pipe, qualifier type deref param) \
1142 { \
1143 struct threaded_context *tc = threaded_context(_pipe); \
1144 struct tc_call_##func *p = (struct tc_call_##func*) \
1145 tc_add_call(tc, TC_CALL_##func, tc_call_##func); \
1146 p->state = deref(param); \
1147 __VA_ARGS__; \
1148 }
1149
1150 TC_FUNC1(set_active_query_state, , bool, , )
1151
1152 TC_FUNC1(set_blend_color, const, struct pipe_blend_color, *, &)
1153 TC_FUNC1(set_stencil_ref, const, struct pipe_stencil_ref, , )
1154 TC_FUNC1(set_clip_state, const, struct pipe_clip_state, *, &)
1155 TC_FUNC1(set_sample_mask, , unsigned, , )
1156 TC_FUNC1(set_min_samples, , unsigned, , )
1157 TC_FUNC1(set_polygon_stipple, const, struct pipe_poly_stipple, *, &)
1158
1159 TC_FUNC1(texture_barrier, , unsigned, , )
1160 TC_FUNC1(memory_barrier, , unsigned, , )
1161 TC_FUNC1(delete_texture_handle, , uint64_t, , )
1162 TC_FUNC1(delete_image_handle, , uint64_t, , )
1163 TC_FUNC1(set_frontend_noop, , bool, , )
1164
1165
1166 /********************************************************************
1167 * queries
1168 */
1169
1170 static struct pipe_query *
1171 tc_create_query(struct pipe_context *_pipe, unsigned query_type,
1172 unsigned index)
1173 {
1174 struct threaded_context *tc = threaded_context(_pipe);
1175 struct pipe_context *pipe = tc->pipe;
1176
1177 return pipe->create_query(pipe, query_type, index);
1178 }
1179
1180 static struct pipe_query *
1181 tc_create_batch_query(struct pipe_context *_pipe, unsigned num_queries,
1182 unsigned *query_types)
1183 {
1184 struct threaded_context *tc = threaded_context(_pipe);
1185 struct pipe_context *pipe = tc->pipe;
1186
1187 return pipe->create_batch_query(pipe, num_queries, query_types);
1188 }
1189
1190 struct tc_query_call {
1191 struct tc_call_base base;
1192 struct pipe_query *query;
1193 };
1194
1195 static uint16_t
1196 tc_call_destroy_query(struct pipe_context *pipe, void *call)
1197 {
1198 struct pipe_query *query = to_call(call, tc_query_call)->query;
1199 struct threaded_query *tq = threaded_query(query);
1200
1201 if (list_is_linked(&tq->head_unflushed))
1202 list_del(&tq->head_unflushed);
1203
1204 pipe->destroy_query(pipe, query);
1205 return call_size(tc_query_call);
1206 }
1207
1208 static void
1209 tc_destroy_query(struct pipe_context *_pipe, struct pipe_query *query)
1210 {
1211 struct threaded_context *tc = threaded_context(_pipe);
1212
1213 tc_add_call(tc, TC_CALL_destroy_query, tc_query_call)->query = query;
1214 }
1215
1216 static uint16_t
1217 tc_call_begin_query(struct pipe_context *pipe, void *call)
1218 {
1219 pipe->begin_query(pipe, to_call(call, tc_query_call)->query);
1220 return call_size(tc_query_call);
1221 }
1222
1223 static bool
1224 tc_begin_query(struct pipe_context *_pipe, struct pipe_query *query)
1225 {
1226 struct threaded_context *tc = threaded_context(_pipe);
1227 tc->num_queries_active++;
1228
1229 tc_add_call(tc, TC_CALL_begin_query, tc_query_call)->query = query;
1230 return true; /* we don't care about the return value for this call */
1231 }
1232
1233 struct tc_end_query_call {
1234 struct tc_call_base base;
1235 struct threaded_context *tc;
1236 struct pipe_query *query;
1237 };
1238
1239 static uint16_t
1240 tc_call_end_query(struct pipe_context *pipe, void *call)
1241 {
1242 struct tc_end_query_call *p = to_call(call, tc_end_query_call);
1243 struct threaded_query *tq = threaded_query(p->query);
1244
1245 if (!list_is_linked(&tq->head_unflushed))
1246 list_add(&tq->head_unflushed, &p->tc->unflushed_queries);
1247
1248 pipe->end_query(pipe, p->query);
1249 return call_size(tc_end_query_call);
1250 }
1251
1252 static bool
1253 tc_end_query(struct pipe_context *_pipe, struct pipe_query *query)
1254 {
1255 struct threaded_context *tc = threaded_context(_pipe);
1256 struct threaded_query *tq = threaded_query(query);
1257 struct tc_end_query_call *call =
1258 tc_add_call(tc, TC_CALL_end_query, tc_end_query_call);
1259 tc->num_queries_active--;
1260
1261 call->tc = tc;
1262 call->query = query;
1263
1264 tq->flushed = false;
1265 tc->query_ended = true;
1266
1267 return true; /* we don't care about the return value for this call */
1268 }
1269
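/* Reading back a query result may require a sync: if the query hasn't been
 * flushed yet, queued calls are executed first so the driver has seen the
 * corresponding begin/end before it is asked for the result.
 */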
1270 static bool
1271 tc_get_query_result(struct pipe_context *_pipe,
1272 struct pipe_query *query, bool wait,
1273 union pipe_query_result *result)
1274 {
1275 struct threaded_context *tc = threaded_context(_pipe);
1276 struct threaded_query *tq = threaded_query(query);
1277 struct pipe_context *pipe = tc->pipe;
1278 bool flushed = tq->flushed;
1279
1280 if (!flushed) {
1281 tc_sync_msg(tc, wait ? "wait" : "nowait");
1282 tc_set_driver_thread(tc);
1283 }
1284
1285 bool success = pipe->get_query_result(pipe, query, wait, result);
1286
1287 if (!flushed)
1288 tc_clear_driver_thread(tc);
1289
1290 if (success) {
1291 tq->flushed = true;
1292 if (list_is_linked(&tq->head_unflushed)) {
1293 /* This is safe because it can only happen after we sync'd. */
1294 list_del(&tq->head_unflushed);
1295 }
1296 }
1297 return success;
1298 }
1299
1300 struct tc_query_result_resource {
1301 struct tc_call_base base;
1302 enum pipe_query_flags flags:8;
1303 enum pipe_query_value_type result_type:8;
1304 int8_t index; /* it can be -1 */
1305 unsigned offset;
1306 struct pipe_query *query;
1307 struct pipe_resource *resource;
1308 };
1309
1310 static uint16_t
1311 tc_call_get_query_result_resource(struct pipe_context *pipe, void *call)
1312 {
1313 struct tc_query_result_resource *p = to_call(call, tc_query_result_resource);
1314
1315 pipe->get_query_result_resource(pipe, p->query, p->flags, p->result_type,
1316 p->index, p->resource, p->offset);
1317 tc_drop_resource_reference(p->resource);
1318 return call_size(tc_query_result_resource);
1319 }
1320
1321 static void
1322 tc_get_query_result_resource(struct pipe_context *_pipe,
1323 struct pipe_query *query,
1324 enum pipe_query_flags flags,
1325 enum pipe_query_value_type result_type, int index,
1326 struct pipe_resource *resource, unsigned offset)
1327 {
1328 struct threaded_context *tc = threaded_context(_pipe);
1329
1330 tc_buffer_disable_cpu_storage(resource);
1331
1332 struct tc_query_result_resource *p =
1333 tc_add_call(tc, TC_CALL_get_query_result_resource,
1334 tc_query_result_resource);
1335 p->query = query;
1336 p->flags = flags;
1337 p->result_type = result_type;
1338 p->index = index;
1339 tc_set_resource_reference(&p->resource, resource);
1340 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], resource);
1341 p->offset = offset;
1342 }
1343
1344 struct tc_render_condition {
1345 struct tc_call_base base;
1346 bool condition;
1347 unsigned mode;
1348 struct pipe_query *query;
1349 };
1350
1351 static uint16_t
1352 tc_call_render_condition(struct pipe_context *pipe, void *call)
1353 {
1354 struct tc_render_condition *p = to_call(call, tc_render_condition);
1355 pipe->render_condition(pipe, p->query, p->condition, p->mode);
1356 return call_size(tc_render_condition);
1357 }
1358
1359 static void
1360 tc_render_condition(struct pipe_context *_pipe,
1361 struct pipe_query *query, bool condition,
1362 enum pipe_render_cond_flag mode)
1363 {
1364 struct threaded_context *tc = threaded_context(_pipe);
1365 struct tc_render_condition *p =
1366 tc_add_call(tc, TC_CALL_render_condition, tc_render_condition);
1367
1368 p->query = query;
1369 p->condition = condition;
1370 p->mode = mode;
1371 }
1372
1373
1374 /********************************************************************
1375 * constant (immutable) states
1376 */
1377
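/* Constant state objects (CSOs): creation is forwarded directly to the driver
 * from the frontend thread (drivers below tc are expected to tolerate this),
 * while bind and delete are recorded as ordinary calls via TC_FUNC1.
 */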
1378 #define TC_CSO_CREATE(name, sname) \
1379 static void * \
1380 tc_create_##name##_state(struct pipe_context *_pipe, \
1381 const struct pipe_##sname##_state *state) \
1382 { \
1383 struct pipe_context *pipe = threaded_context(_pipe)->pipe; \
1384 return pipe->create_##name##_state(pipe, state); \
1385 }
1386
1387 #define TC_CSO_BIND(name, ...) TC_FUNC1(bind_##name##_state, , void *, , , ##__VA_ARGS__)
1388 #define TC_CSO_DELETE(name) TC_FUNC1(delete_##name##_state, , void *, , )
1389
1390 #define TC_CSO(name, sname, ...) \
1391 TC_CSO_CREATE(name, sname) \
1392 TC_CSO_BIND(name, ##__VA_ARGS__) \
1393 TC_CSO_DELETE(name)
1394
1395 #define TC_CSO_WHOLE(name) TC_CSO(name, name)
1396 #define TC_CSO_SHADER(name) TC_CSO(name, shader)
1397 #define TC_CSO_SHADER_TRACK(name) TC_CSO(name, shader, tc->seen_##name = true;)
1398
1399 TC_CSO_WHOLE(blend)
1400 TC_CSO_WHOLE(rasterizer)
1401 TC_CSO_CREATE(depth_stencil_alpha, depth_stencil_alpha)
1402 TC_CSO_BIND(depth_stencil_alpha,
1403 if (param && tc->options.parse_renderpass_info) {
1404 /* dsa info is only ever added during a renderpass;
1405 * changes outside of a renderpass reset the data
1406 */
1407 if (!tc->in_renderpass) {
1408 tc_get_renderpass_info(tc)->zsbuf_write_dsa = 0;
1409 tc_get_renderpass_info(tc)->zsbuf_read_dsa = 0;
1410 }
1411 /* let the driver parse its own state */
1412 tc->options.dsa_parse(param, tc_get_renderpass_info(tc));
1413 }
1414 )
1415 TC_CSO_DELETE(depth_stencil_alpha)
1416 TC_CSO_WHOLE(compute)
1417 TC_CSO_CREATE(fs, shader)
1418 TC_CSO_BIND(fs,
1419 if (param && tc->options.parse_renderpass_info) {
1420 /* fs info is only ever added during a renderpass;
1421 * changes outside of a renderpass reset the data
1422 */
1423 if (!tc->in_renderpass) {
1424 tc_get_renderpass_info(tc)->cbuf_fbfetch = 0;
1425 tc_get_renderpass_info(tc)->zsbuf_write_fs = 0;
1426 }
1427 /* let the driver parse its own state */
1428 tc->options.fs_parse(param, tc_get_renderpass_info(tc));
1429 }
1430 )
1431 TC_CSO_DELETE(fs)
1432 TC_CSO_SHADER(vs)
1433 TC_CSO_SHADER_TRACK(gs)
1434 TC_CSO_SHADER_TRACK(tcs)
1435 TC_CSO_SHADER_TRACK(tes)
1436 TC_CSO_CREATE(sampler, sampler)
1437 TC_CSO_DELETE(sampler)
1438 TC_CSO_BIND(vertex_elements)
1439 TC_CSO_DELETE(vertex_elements)
1440
1441 static void *
1442 tc_create_vertex_elements_state(struct pipe_context *_pipe, unsigned count,
1443 const struct pipe_vertex_element *elems)
1444 {
1445 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
1446
1447 return pipe->create_vertex_elements_state(pipe, count, elems);
1448 }
1449
1450 struct tc_sampler_states {
1451 struct tc_call_base base;
1452 uint8_t shader, start, count;
1453 void *slot[0]; /* more will be allocated if needed */
1454 };
1455
1456 static uint16_t
1457 tc_call_bind_sampler_states(struct pipe_context *pipe, void *call)
1458 {
1459 struct tc_sampler_states *p = (struct tc_sampler_states *)call;
1460
1461 pipe->bind_sampler_states(pipe, p->shader, p->start, p->count, p->slot);
1462 return p->base.num_slots;
1463 }
1464
1465 static void
1466 tc_bind_sampler_states(struct pipe_context *_pipe,
1467 enum pipe_shader_type shader,
1468 unsigned start, unsigned count, void **states)
1469 {
1470 if (!count)
1471 return;
1472
1473 struct threaded_context *tc = threaded_context(_pipe);
1474 struct tc_sampler_states *p =
1475 tc_add_slot_based_call(tc, TC_CALL_bind_sampler_states, tc_sampler_states, count);
1476
1477 p->shader = shader;
1478 p->start = start;
1479 p->count = count;
1480 memcpy(p->slot, states, count * sizeof(states[0]));
1481 }
1482
1483 static void
1484 tc_link_shader(struct pipe_context *_pipe, void **shaders)
1485 {
1486 struct threaded_context *tc = threaded_context(_pipe);
1487 tc->pipe->link_shader(tc->pipe, shaders);
1488 }
1489 /********************************************************************
1490 * immediate states
1491 */
1492
1493 struct tc_framebuffer {
1494 struct tc_call_base base;
1495 struct pipe_framebuffer_state state;
1496 };
1497
1498 static uint16_t
1499 tc_call_set_framebuffer_state(struct pipe_context *pipe, void *call)
1500 {
1501 struct pipe_framebuffer_state *p = &to_call(call, tc_framebuffer)->state;
1502
1503 pipe->set_framebuffer_state(pipe, p);
1504
1505 unsigned nr_cbufs = p->nr_cbufs;
1506 for (unsigned i = 0; i < nr_cbufs; i++)
1507 tc_drop_surface_reference(p->cbufs[i]);
1508 tc_drop_surface_reference(p->zsbuf);
1509 tc_drop_resource_reference(p->resolve);
1510 return call_size(tc_framebuffer);
1511 }
1512
1513 static void
1514 tc_set_framebuffer_state(struct pipe_context *_pipe,
1515 const struct pipe_framebuffer_state *fb)
1516 {
1517 struct threaded_context *tc = threaded_context(_pipe);
1518 struct tc_framebuffer *p =
1519 tc_add_call(tc, TC_CALL_set_framebuffer_state, tc_framebuffer);
1520 unsigned nr_cbufs = fb->nr_cbufs;
1521
1522 p->state.width = fb->width;
1523 p->state.height = fb->height;
1524 p->state.samples = fb->samples;
1525 p->state.layers = fb->layers;
1526 p->state.nr_cbufs = nr_cbufs;
1527 p->state.viewmask = fb->viewmask;
1528
1529 /* when unbinding, mark attachments as used for the current batch */
1530 for (unsigned i = 0; i < tc->nr_cbufs; i++) {
1531 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[i], false);
1532 pipe_resource_reference(&tc->fb_resources[i], NULL);
1533 }
1534 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[PIPE_MAX_COLOR_BUFS], false);
1535 tc_set_resource_batch_usage_persistent(tc, tc->fb_resolve, false);
1536
1537 for (unsigned i = 0; i < nr_cbufs; i++) {
1538 p->state.cbufs[i] = NULL;
1539 pipe_surface_reference(&p->state.cbufs[i], fb->cbufs[i]);
1540 /* full tracking requires storing the fb attachment resources */
1541 if (fb->cbufs[i])
1542 pipe_resource_reference(&tc->fb_resources[i], fb->cbufs[i]->texture);
1543 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[i], true);
1544 }
1545 tc->nr_cbufs = nr_cbufs;
1546 if (tc->options.parse_renderpass_info) {
1547 /* ensure this is treated as the first fb set if no fb activity has occurred */
1548 if (!tc->renderpass_info_recording->has_draw &&
1549 !tc->renderpass_info_recording->cbuf_clear &&
1550 !tc->renderpass_info_recording->cbuf_load &&
1551 !tc->renderpass_info_recording->zsbuf_load &&
1552 !tc->renderpass_info_recording->zsbuf_clear_partial)
1553 tc->batch_slots[tc->next].first_set_fb = false;
1554 /* store existing zsbuf data for possible persistence */
1555 uint8_t zsbuf = tc->renderpass_info_recording->has_draw ?
1556 0 :
1557 tc->renderpass_info_recording->data8[3];
1558 bool zsbuf_changed = tc->fb_resources[PIPE_MAX_COLOR_BUFS] !=
1559 (fb->zsbuf ? fb->zsbuf->texture : NULL);
1560
1561 if (tc->seen_fb_state) {
1562 /* this is the end of a renderpass, so increment the renderpass info */
1563 tc_batch_increment_renderpass_info(tc, tc->next, false);
1564 /* if zsbuf hasn't changed (i.e., possibly just adding a color buffer):
1565 * keep zsbuf usage data
1566 */
1567 if (!zsbuf_changed)
1568 tc->renderpass_info_recording->data8[3] = zsbuf;
1569 } else {
1570 /* this is the first time a set_framebuffer_state call is triggered;
1571 * just increment the index and keep using the existing info for recording
1572 */
1573 tc->batch_slots[tc->next].renderpass_info_idx = 0;
1574 }
1575 /* future fb state changes will increment the index */
1576 tc->seen_fb_state = true;
1577 }
1578 pipe_resource_reference(&tc->fb_resources[PIPE_MAX_COLOR_BUFS],
1579 fb->zsbuf ? fb->zsbuf->texture : NULL);
1580 pipe_resource_reference(&tc->fb_resolve, fb->resolve);
1581 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[PIPE_MAX_COLOR_BUFS], true);
1582 tc_set_resource_batch_usage_persistent(tc, tc->fb_resolve, true);
1583 tc->in_renderpass = false;
1584 p->state.zsbuf = NULL;
1585 pipe_surface_reference(&p->state.zsbuf, fb->zsbuf);
1586 p->state.resolve = NULL;
1587 pipe_resource_reference(&p->state.resolve, fb->resolve);
1588 }
1589
1590 struct tc_tess_state {
1591 struct tc_call_base base;
1592 float state[6];
1593 };
1594
1595 static uint16_t
1596 tc_call_set_tess_state(struct pipe_context *pipe, void *call)
1597 {
1598 float *p = to_call(call, tc_tess_state)->state;
1599
1600 pipe->set_tess_state(pipe, p, p + 4);
1601 return call_size(tc_tess_state);
1602 }
1603
1604 static void
1605 tc_set_tess_state(struct pipe_context *_pipe,
1606 const float default_outer_level[4],
1607 const float default_inner_level[2])
1608 {
1609 struct threaded_context *tc = threaded_context(_pipe);
1610 float *p = tc_add_call(tc, TC_CALL_set_tess_state, tc_tess_state)->state;
1611
1612 memcpy(p, default_outer_level, 4 * sizeof(float));
1613 memcpy(p + 4, default_inner_level, 2 * sizeof(float));
1614 }
1615
1616 struct tc_patch_vertices {
1617 struct tc_call_base base;
1618 uint8_t patch_vertices;
1619 };
1620
1621 static uint16_t
1622 tc_call_set_patch_vertices(struct pipe_context *pipe, void *call)
1623 {
1624 uint8_t patch_vertices = to_call(call, tc_patch_vertices)->patch_vertices;
1625
1626 pipe->set_patch_vertices(pipe, patch_vertices);
1627 return call_size(tc_patch_vertices);
1628 }
1629
1630 static void
1631 tc_set_patch_vertices(struct pipe_context *_pipe, uint8_t patch_vertices)
1632 {
1633 struct threaded_context *tc = threaded_context(_pipe);
1634
1635 tc_add_call(tc, TC_CALL_set_patch_vertices,
1636 tc_patch_vertices)->patch_vertices = patch_vertices;
1637 }
1638
1639 struct tc_constant_buffer_base {
1640 struct tc_call_base base;
1641 uint8_t shader, index;
1642 bool is_null;
1643 };
1644
1645 struct tc_constant_buffer {
1646 struct tc_constant_buffer_base base;
1647 struct pipe_constant_buffer cb;
1648 };
1649
1650 static uint16_t
1651 tc_call_set_constant_buffer(struct pipe_context *pipe, void *call)
1652 {
1653 struct tc_constant_buffer *p = (struct tc_constant_buffer *)call;
1654
1655 if (unlikely(p->base.is_null)) {
1656 pipe->set_constant_buffer(pipe, p->base.shader, p->base.index, false, NULL);
1657 return call_size(tc_constant_buffer_base);
1658 }
1659
1660 pipe->set_constant_buffer(pipe, p->base.shader, p->base.index, true, &p->cb);
1661 return call_size(tc_constant_buffer);
1662 }
1663
1664 static void
1665 tc_set_constant_buffer(struct pipe_context *_pipe,
1666 enum pipe_shader_type shader, uint index,
1667 bool take_ownership,
1668 const struct pipe_constant_buffer *cb)
1669 {
1670 struct threaded_context *tc = threaded_context(_pipe);
1671
1672 if (unlikely(!cb || (!cb->buffer && !cb->user_buffer))) {
1673 struct tc_constant_buffer_base *p =
1674 tc_add_call(tc, TC_CALL_set_constant_buffer, tc_constant_buffer_base);
1675 p->shader = shader;
1676 p->index = index;
1677 p->is_null = true;
1678 tc_unbind_buffer(&tc->const_buffers[shader][index]);
1679 return;
1680 }
1681
1682 struct pipe_resource *buffer;
1683 unsigned offset;
1684
1685 if (cb->user_buffer) {
1686 /* This must be done before adding set_constant_buffer, because it could
1687 * generate e.g. transfer_unmap and flush partially-uninitialized
1688 * set_constant_buffer to the driver if it was done afterwards.
1689 */
1690 buffer = NULL;
1691 u_upload_data(tc->base.const_uploader, 0, cb->buffer_size,
1692 tc->ubo_alignment, cb->user_buffer, &offset, &buffer);
1693 u_upload_unmap(tc->base.const_uploader);
1694 take_ownership = true;
1695 } else {
1696 buffer = cb->buffer;
1697 offset = cb->buffer_offset;
1698 }
1699
1700 struct tc_constant_buffer *p =
1701 tc_add_call(tc, TC_CALL_set_constant_buffer, tc_constant_buffer);
1702 p->base.shader = shader;
1703 p->base.index = index;
1704 p->base.is_null = false;
1705 p->cb.user_buffer = NULL;
1706 p->cb.buffer_offset = offset;
1707 p->cb.buffer_size = cb->buffer_size;
1708
1709 if (take_ownership)
1710 p->cb.buffer = buffer;
1711 else
1712 tc_set_resource_reference(&p->cb.buffer, buffer);
1713
1714 if (buffer) {
1715 tc_bind_buffer(&tc->const_buffers[shader][index],
1716 &tc->buffer_lists[tc->next_buf_list], buffer);
1717 } else {
1718 tc_unbind_buffer(&tc->const_buffers[shader][index]);
1719 }
1720 }
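/* Usage sketch for the user-buffer path above (hypothetical caller; my_consts
 * is an illustrative name). The u_upload_data()/u_upload_unmap() pair runs
 * before tc_add_call() so that any calls generated by the uploader can't
 * flush a half-initialized set_constant_buffer to the driver:
 *
 *    struct pipe_constant_buffer cb = {
 *       .user_buffer = my_consts,
 *       .buffer_size = sizeof(my_consts),
 *    };
 *    ctx->set_constant_buffer(ctx, PIPE_SHADER_FRAGMENT, 0, false, &cb);
 *    // the driver later receives a real buffer + offset, never the pointer
 */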
1721
1722 struct tc_inlinable_constants {
1723 struct tc_call_base base;
1724 uint8_t shader;
1725 uint8_t num_values;
1726 uint32_t values[MAX_INLINABLE_UNIFORMS];
1727 };
1728
1729 static uint16_t
1730 tc_call_set_inlinable_constants(struct pipe_context *pipe, void *call)
1731 {
1732 struct tc_inlinable_constants *p = to_call(call, tc_inlinable_constants);
1733
1734 pipe->set_inlinable_constants(pipe, p->shader, p->num_values, p->values);
1735 return call_size(tc_inlinable_constants);
1736 }
1737
1738 static void
1739 tc_set_inlinable_constants(struct pipe_context *_pipe,
1740 enum pipe_shader_type shader,
1741 uint num_values, uint32_t *values)
1742 {
1743 struct threaded_context *tc = threaded_context(_pipe);
1744 struct tc_inlinable_constants *p =
1745 tc_add_call(tc, TC_CALL_set_inlinable_constants, tc_inlinable_constants);
1746 p->shader = shader;
1747 p->num_values = num_values;
1748 memcpy(p->values, values, num_values * 4);
1749 }
1750
1751 struct tc_sample_locations {
1752 struct tc_call_base base;
1753 uint16_t size;
1754 uint8_t slot[0];
1755 };
1756
1757
1758 static uint16_t
1759 tc_call_set_sample_locations(struct pipe_context *pipe, void *call)
1760 {
1761 struct tc_sample_locations *p = (struct tc_sample_locations *)call;
1762
1763 pipe->set_sample_locations(pipe, p->size, p->slot);
1764 return p->base.num_slots;
1765 }
1766
1767 static void
1768 tc_set_sample_locations(struct pipe_context *_pipe, size_t size, const uint8_t *locations)
1769 {
1770 struct threaded_context *tc = threaded_context(_pipe);
1771 struct tc_sample_locations *p =
1772 tc_add_slot_based_call(tc, TC_CALL_set_sample_locations,
1773 tc_sample_locations, size);
1774
1775 p->size = size;
1776 memcpy(p->slot, locations, size);
1777 }
1778
1779 struct tc_scissors {
1780 struct tc_call_base base;
1781 uint8_t start, count;
1782 struct pipe_scissor_state slot[0]; /* more will be allocated if needed */
1783 };
1784
1785 static uint16_t
1786 tc_call_set_scissor_states(struct pipe_context *pipe, void *call)
1787 {
1788 struct tc_scissors *p = (struct tc_scissors *)call;
1789
1790 pipe->set_scissor_states(pipe, p->start, p->count, p->slot);
1791 return p->base.num_slots;
1792 }
1793
1794 static void
1795 tc_set_scissor_states(struct pipe_context *_pipe,
1796 unsigned start, unsigned count,
1797 const struct pipe_scissor_state *states)
1798 {
1799 struct threaded_context *tc = threaded_context(_pipe);
1800 struct tc_scissors *p =
1801 tc_add_slot_based_call(tc, TC_CALL_set_scissor_states, tc_scissors, count);
1802
1803 p->start = start;
1804 p->count = count;
1805 memcpy(&p->slot, states, count * sizeof(states[0]));
1806 }
1807
1808 struct tc_viewports {
1809 struct tc_call_base base;
1810 uint8_t start, count;
1811 struct pipe_viewport_state slot[0]; /* more will be allocated if needed */
1812 };
1813
1814 static uint16_t
1815 tc_call_set_viewport_states(struct pipe_context *pipe, void *call)
1816 {
1817 struct tc_viewports *p = (struct tc_viewports *)call;
1818
1819 pipe->set_viewport_states(pipe, p->start, p->count, p->slot);
1820 return p->base.num_slots;
1821 }
1822
1823 static void
1824 tc_set_viewport_states(struct pipe_context *_pipe,
1825 unsigned start, unsigned count,
1826 const struct pipe_viewport_state *states)
1827 {
1828 if (!count)
1829 return;
1830
1831 struct threaded_context *tc = threaded_context(_pipe);
1832 struct tc_viewports *p =
1833 tc_add_slot_based_call(tc, TC_CALL_set_viewport_states, tc_viewports, count);
1834
1835 p->start = start;
1836 p->count = count;
1837 memcpy(&p->slot, states, count * sizeof(states[0]));
1838 }
1839
1840 struct tc_window_rects {
1841 struct tc_call_base base;
1842 bool include;
1843 uint8_t count;
1844 struct pipe_scissor_state slot[0]; /* more will be allocated if needed */
1845 };
1846
1847 static uint16_t
1848 tc_call_set_window_rectangles(struct pipe_context *pipe, void *call)
1849 {
1850 struct tc_window_rects *p = (struct tc_window_rects *)call;
1851
1852 pipe->set_window_rectangles(pipe, p->include, p->count, p->slot);
1853 return p->base.num_slots;
1854 }
1855
1856 static void
1857 tc_set_window_rectangles(struct pipe_context *_pipe, bool include,
1858 unsigned count,
1859 const struct pipe_scissor_state *rects)
1860 {
1861 struct threaded_context *tc = threaded_context(_pipe);
1862 struct tc_window_rects *p =
1863 tc_add_slot_based_call(tc, TC_CALL_set_window_rectangles, tc_window_rects, count);
1864
1865 p->include = include;
1866 p->count = count;
1867 memcpy(p->slot, rects, count * sizeof(rects[0]));
1868 }
1869
1870 struct tc_sampler_views {
1871 struct tc_call_base base;
1872 uint8_t shader, start, count, unbind_num_trailing_slots;
1873 struct pipe_sampler_view *slot[0]; /* more will be allocated if needed */
1874 };
1875
1876 static uint16_t
1877 tc_call_set_sampler_views(struct pipe_context *pipe, void *call)
1878 {
1879 struct tc_sampler_views *p = (struct tc_sampler_views *)call;
1880
1881 pipe->set_sampler_views(pipe, p->shader, p->start, p->count,
1882 p->unbind_num_trailing_slots, true, p->slot);
1883 return p->base.num_slots;
1884 }
1885
1886 static void
1887 tc_set_sampler_views(struct pipe_context *_pipe,
1888 enum pipe_shader_type shader,
1889 unsigned start, unsigned count,
1890 unsigned unbind_num_trailing_slots, bool take_ownership,
1891 struct pipe_sampler_view **views)
1892 {
1893 if (!count && !unbind_num_trailing_slots)
1894 return;
1895
1896 struct threaded_context *tc = threaded_context(_pipe);
1897 struct tc_sampler_views *p =
1898 tc_add_slot_based_call(tc, TC_CALL_set_sampler_views, tc_sampler_views,
1899 views ? count : 0);
1900
1901 p->shader = shader;
1902 p->start = start;
1903
1904 if (views) {
1905 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
1906
1907 p->count = count;
1908 p->unbind_num_trailing_slots = unbind_num_trailing_slots;
1909
1910 if (take_ownership) {
1911 memcpy(p->slot, views, sizeof(*views) * count);
1912
1913 for (unsigned i = 0; i < count; i++) {
1914 if (views[i]) {
1915 if (views[i]->target == PIPE_BUFFER)
1916 tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next,
1917 views[i]->texture);
1918 else
1919 tc_set_resource_batch_usage(tc, views[i]->texture);
1920 } else {
1921 tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]);
1922 }
1923 }
1924 } else {
1925 for (unsigned i = 0; i < count; i++) {
1926 p->slot[i] = NULL;
1927 pipe_sampler_view_reference(&p->slot[i], views[i]);
1928
1929 if (views[i]) {
1930 if (views[i]->target == PIPE_BUFFER)
1931 tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next,
1932 views[i]->texture);
1933 else
1934 tc_set_resource_batch_usage(tc, views[i]->texture);
1935 } else {
1936 tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]);
1937 }
1938 }
1939 }
1940
1941 tc_unbind_buffers(&tc->sampler_buffers[shader][start + count],
1942 unbind_num_trailing_slots);
1943 tc->seen_sampler_buffers[shader] = true;
1944 } else {
1945 p->count = 0;
1946 p->unbind_num_trailing_slots = count + unbind_num_trailing_slots;
1947
1948 tc_unbind_buffers(&tc->sampler_buffers[shader][start],
1949 count + unbind_num_trailing_slots);
1950 }
1951 }
1952
1953 struct tc_shader_images {
1954 struct tc_call_base base;
1955 uint8_t shader, start, count;
1956 uint8_t unbind_num_trailing_slots;
1957 struct pipe_image_view slot[0]; /* more will be allocated if needed */
1958 };
1959
1960 static uint16_t
1961 tc_call_set_shader_images(struct pipe_context *pipe, void *call)
1962 {
1963 struct tc_shader_images *p = (struct tc_shader_images *)call;
1964 unsigned count = p->count;
1965
1966 if (!p->count) {
1967 pipe->set_shader_images(pipe, p->shader, p->start, 0,
1968 p->unbind_num_trailing_slots, NULL);
1969 return call_size(tc_shader_images);
1970 }
1971
1972 pipe->set_shader_images(pipe, p->shader, p->start, p->count,
1973 p->unbind_num_trailing_slots, p->slot);
1974
1975 for (unsigned i = 0; i < count; i++)
1976 tc_drop_resource_reference(p->slot[i].resource);
1977
1978 return p->base.num_slots;
1979 }
1980
1981 static void
1982 tc_set_shader_images(struct pipe_context *_pipe,
1983 enum pipe_shader_type shader,
1984 unsigned start, unsigned count,
1985 unsigned unbind_num_trailing_slots,
1986 const struct pipe_image_view *images)
1987 {
1988 if (!count && !unbind_num_trailing_slots)
1989 return;
1990
1991 struct threaded_context *tc = threaded_context(_pipe);
1992 struct tc_shader_images *p =
1993 tc_add_slot_based_call(tc, TC_CALL_set_shader_images, tc_shader_images,
1994 images ? count : 0);
1995 unsigned writable_buffers = 0;
1996
1997 p->shader = shader;
1998 p->start = start;
1999
2000 if (images) {
2001 p->count = count;
2002 p->unbind_num_trailing_slots = unbind_num_trailing_slots;
2003
2004 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2005
2006 for (unsigned i = 0; i < count; i++) {
2007 struct pipe_resource *resource = images[i].resource;
2008
2009 tc_set_resource_reference(&p->slot[i].resource, resource);
2010
2011 if (resource) {
2012 if (resource->target == PIPE_BUFFER) {
2013 tc_bind_buffer(&tc->image_buffers[shader][start + i], next, resource);
2014
2015 if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) {
2016 struct threaded_resource *tres = threaded_resource(resource);
2017
2018 tc_buffer_disable_cpu_storage(resource);
2019 util_range_add(&tres->b, &tres->valid_buffer_range,
2020 images[i].u.buf.offset,
2021 images[i].u.buf.offset + images[i].u.buf.size);
2022 writable_buffers |= BITFIELD_BIT(start + i);
2023 }
2024 } else {
2025 tc_set_resource_batch_usage(tc, resource);
2026 }
2027 } else {
2028 tc_unbind_buffer(&tc->image_buffers[shader][start + i]);
2029 }
2030 }
2031 memcpy(p->slot, images, count * sizeof(images[0]));
2032
2033 tc_unbind_buffers(&tc->image_buffers[shader][start + count],
2034 unbind_num_trailing_slots);
2035 tc->seen_image_buffers[shader] = true;
2036 } else {
2037 p->count = 0;
2038 p->unbind_num_trailing_slots = count + unbind_num_trailing_slots;
2039
2040 tc_unbind_buffers(&tc->image_buffers[shader][start],
2041 count + unbind_num_trailing_slots);
2042 }
2043
2044 tc->image_buffers_writeable_mask[shader] &= ~BITFIELD_RANGE(start, count);
2045 tc->image_buffers_writeable_mask[shader] |= writable_buffers;
2046 }
2047
2048 struct tc_shader_buffers {
2049 struct tc_call_base base;
2050 uint8_t shader, start, count;
2051 bool unbind;
2052 unsigned writable_bitmask;
2053 struct pipe_shader_buffer slot[0]; /* more will be allocated if needed */
2054 };
2055
2056 static uint16_t
2057 tc_call_set_shader_buffers(struct pipe_context *pipe, void *call)
2058 {
2059 struct tc_shader_buffers *p = (struct tc_shader_buffers *)call;
2060 unsigned count = p->count;
2061
2062 if (p->unbind) {
2063 pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, NULL, 0);
2064 return call_size(tc_shader_buffers);
2065 }
2066
2067 pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, p->slot,
2068 p->writable_bitmask);
2069
2070 for (unsigned i = 0; i < count; i++)
2071 tc_drop_resource_reference(p->slot[i].buffer);
2072
2073 return p->base.num_slots;
2074 }
2075
2076 static void
2077 tc_set_shader_buffers(struct pipe_context *_pipe,
2078 enum pipe_shader_type shader,
2079 unsigned start, unsigned count,
2080 const struct pipe_shader_buffer *buffers,
2081 unsigned writable_bitmask)
2082 {
2083 if (!count)
2084 return;
2085
2086 struct threaded_context *tc = threaded_context(_pipe);
2087 struct tc_shader_buffers *p =
2088 tc_add_slot_based_call(tc, TC_CALL_set_shader_buffers, tc_shader_buffers,
2089 buffers ? count : 0);
2090
2091 p->shader = shader;
2092 p->start = start;
2093 p->count = count;
2094 p->unbind = buffers == NULL;
2095 p->writable_bitmask = writable_bitmask;
2096
2097 if (buffers) {
2098 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2099
2100 for (unsigned i = 0; i < count; i++) {
2101 struct pipe_shader_buffer *dst = &p->slot[i];
2102 const struct pipe_shader_buffer *src = buffers + i;
2103
2104 tc_set_resource_reference(&dst->buffer, src->buffer);
2105 dst->buffer_offset = src->buffer_offset;
2106 dst->buffer_size = src->buffer_size;
2107
2108 if (src->buffer) {
2109 struct threaded_resource *tres = threaded_resource(src->buffer);
2110
2111 tc_bind_buffer(&tc->shader_buffers[shader][start + i], next, &tres->b);
2112
2113 if (writable_bitmask & BITFIELD_BIT(i)) {
2114 tc_buffer_disable_cpu_storage(src->buffer);
2115 util_range_add(&tres->b, &tres->valid_buffer_range,
2116 src->buffer_offset,
2117 src->buffer_offset + src->buffer_size);
2118 }
2119 } else {
2120 tc_unbind_buffer(&tc->shader_buffers[shader][start + i]);
2121 }
2122 }
2123 tc->seen_shader_buffers[shader] = true;
2124 } else {
2125 tc_unbind_buffers(&tc->shader_buffers[shader][start], count);
2126 }
2127
2128 tc->shader_buffers_writeable_mask[shader] &= ~BITFIELD_RANGE(start, count);
2129 tc->shader_buffers_writeable_mask[shader] |= writable_bitmask << start;
2130 }
2131
2132 static uint16_t
2133 tc_call_set_vertex_buffers(struct pipe_context *pipe, void *call)
2134 {
2135 struct tc_vertex_buffers *p = (struct tc_vertex_buffers *)call;
2136 unsigned count = p->count;
2137
2138 for (unsigned i = 0; i < count; i++)
2139 tc_assert(!p->slot[i].is_user_buffer);
2140
2141 pipe->set_vertex_buffers(pipe, count, p->slot);
2142 return p->base.num_slots;
2143 }
2144
2145 static void
2146 tc_set_vertex_buffers(struct pipe_context *_pipe, unsigned count,
2147 const struct pipe_vertex_buffer *buffers)
2148 {
2149 struct threaded_context *tc = threaded_context(_pipe);
2150
2151 assert(!count || buffers);
2152
2153 if (count) {
2154 struct tc_vertex_buffers *p =
2155 tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, count);
2156 p->count = count;
2157
2158 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2159
2160 memcpy(p->slot, buffers, count * sizeof(struct pipe_vertex_buffer));
2161
2162 for (unsigned i = 0; i < count; i++) {
2163 struct pipe_resource *buf = buffers[i].buffer.resource;
2164
2165 if (buf) {
2166 tc_bind_buffer(&tc->vertex_buffers[i], next, buf);
2167 } else {
2168 tc_unbind_buffer(&tc->vertex_buffers[i]);
2169 }
2170 }
2171 } else {
2172 struct tc_vertex_buffers *p =
2173 tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, 0);
2174 p->count = 0;
2175 }
2176
2177 /* We don't need to unbind trailing buffers because we never touch bindings
2178 * after num_vertex_buffers.
2179 */
2180 tc->num_vertex_buffers = count;
2181 }
2182
2183 struct pipe_vertex_buffer *
2184 tc_add_set_vertex_buffers_call(struct pipe_context *_pipe, unsigned count)
2185 {
2186 struct threaded_context *tc = threaded_context(_pipe);
2187
2188 /* We don't need to unbind trailing buffers because we never touch bindings
2189 * after num_vertex_buffers.
2190 */
2191 tc->num_vertex_buffers = count;
2192
2193 struct tc_vertex_buffers *p =
2194 tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, count);
2195 p->count = count;
2196 return p->slot;
2197 }
2198
2199 struct tc_stream_outputs {
2200 struct tc_call_base base;
2201 uint8_t count;
2202 uint8_t output_prim;
2203 struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
2204 unsigned offsets[PIPE_MAX_SO_BUFFERS];
2205 };
2206
2207 static uint16_t
2208 tc_call_set_stream_output_targets(struct pipe_context *pipe, void *call)
2209 {
2210 struct tc_stream_outputs *p = to_call(call, tc_stream_outputs);
2211 unsigned count = p->count;
2212
2213 pipe->set_stream_output_targets(pipe, count, p->targets, p->offsets,
2214 p->output_prim);
2215 for (unsigned i = 0; i < count; i++)
2216 tc_drop_so_target_reference(p->targets[i]);
2217
2218 return call_size(tc_stream_outputs);
2219 }
2220
2221 static void
2222 tc_set_stream_output_targets(struct pipe_context *_pipe,
2223 unsigned count,
2224 struct pipe_stream_output_target **tgs,
2225 const unsigned *offsets,
2226 enum mesa_prim output_prim)
2227 {
2228 struct threaded_context *tc = threaded_context(_pipe);
2229 struct tc_stream_outputs *p =
2230 tc_add_call(tc, TC_CALL_set_stream_output_targets, tc_stream_outputs);
2231 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2232
2233 for (unsigned i = 0; i < count; i++) {
2234 p->targets[i] = NULL;
2235 pipe_so_target_reference(&p->targets[i], tgs[i]);
2236 if (tgs[i]) {
2237 tc_buffer_disable_cpu_storage(tgs[i]->buffer);
2238 tc_bind_buffer(&tc->streamout_buffers[i], next, tgs[i]->buffer);
2239 } else {
2240 tc_unbind_buffer(&tc->streamout_buffers[i]);
2241 }
2242 }
2243 p->count = count;
2244 p->output_prim = output_prim;
2245 memcpy(p->offsets, offsets, count * sizeof(unsigned));
2246
2247 tc_unbind_buffers(&tc->streamout_buffers[count], PIPE_MAX_SO_BUFFERS - count);
2248 if (count)
2249 tc->seen_streamout_buffers = true;
2250 }
2251
2252 static void
2253 tc_set_compute_resources(struct pipe_context *_pipe, unsigned start,
2254 unsigned count, struct pipe_surface **resources)
2255 {
2256 struct threaded_context *tc = threaded_context(_pipe);
2257 struct pipe_context *pipe = tc->pipe;
2258
2259 tc_sync(tc);
2260 pipe->set_compute_resources(pipe, start, count, resources);
2261 }
2262
2263 static void
2264 tc_set_global_binding(struct pipe_context *_pipe, unsigned first,
2265 unsigned count, struct pipe_resource **resources,
2266 uint32_t **handles)
2267 {
2268 struct threaded_context *tc = threaded_context(_pipe);
2269 struct pipe_context *pipe = tc->pipe;
2270
2271 tc_sync(tc);
2272 pipe->set_global_binding(pipe, first, count, resources, handles);
2273 }
2274
2275
2276 /********************************************************************
2277 * views
2278 */
2279
2280 static struct pipe_surface *
2281 tc_create_surface(struct pipe_context *_pipe,
2282 struct pipe_resource *resource,
2283 const struct pipe_surface *surf_tmpl)
2284 {
2285 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2286 struct pipe_surface *view =
2287 pipe->create_surface(pipe, resource, surf_tmpl);
2288
2289 if (view)
2290 view->context = _pipe;
2291 return view;
2292 }
2293
2294 static void
2295 tc_surface_destroy(struct pipe_context *_pipe,
2296 struct pipe_surface *surf)
2297 {
2298 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2299
2300 pipe->surface_destroy(pipe, surf);
2301 }
2302
2303 static struct pipe_sampler_view *
2304 tc_create_sampler_view(struct pipe_context *_pipe,
2305 struct pipe_resource *resource,
2306 const struct pipe_sampler_view *templ)
2307 {
2308 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2309 struct pipe_sampler_view *view =
2310 pipe->create_sampler_view(pipe, resource, templ);
2311
2312 if (view)
2313 view->context = _pipe;
2314 return view;
2315 }
2316
2317 static void
2318 tc_sampler_view_destroy(struct pipe_context *_pipe,
2319 struct pipe_sampler_view *view)
2320 {
2321 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2322
2323 pipe->sampler_view_destroy(pipe, view);
2324 }
2325
2326 static struct pipe_stream_output_target *
2327 tc_create_stream_output_target(struct pipe_context *_pipe,
2328 struct pipe_resource *res,
2329 unsigned buffer_offset,
2330 unsigned buffer_size)
2331 {
2332 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2333 struct threaded_resource *tres = threaded_resource(res);
2334 struct pipe_stream_output_target *view;
2335
2336 util_range_add(&tres->b, &tres->valid_buffer_range, buffer_offset,
2337 buffer_offset + buffer_size);
2338
2339 view = pipe->create_stream_output_target(pipe, res, buffer_offset,
2340 buffer_size);
2341 if (view)
2342 view->context = _pipe;
2343 return view;
2344 }
2345
2346 static void
2347 tc_stream_output_target_destroy(struct pipe_context *_pipe,
2348 struct pipe_stream_output_target *target)
2349 {
2350 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2351
2352 pipe->stream_output_target_destroy(pipe, target);
2353 }
2354
2355
2356 /********************************************************************
2357 * bindless
2358 */
2359
2360 static uint64_t
2361 tc_create_texture_handle(struct pipe_context *_pipe,
2362 struct pipe_sampler_view *view,
2363 const struct pipe_sampler_state *state)
2364 {
2365 struct threaded_context *tc = threaded_context(_pipe);
2366 struct pipe_context *pipe = tc->pipe;
2367
2368 tc_sync(tc);
2369 return pipe->create_texture_handle(pipe, view, state);
2370 }
2371
2372 struct tc_make_texture_handle_resident {
2373 struct tc_call_base base;
2374 bool resident;
2375 uint64_t handle;
2376 };
2377
2378 static uint16_t
2379 tc_call_make_texture_handle_resident(struct pipe_context *pipe, void *call)
2380 {
2381 struct tc_make_texture_handle_resident *p =
2382 to_call(call, tc_make_texture_handle_resident);
2383
2384 pipe->make_texture_handle_resident(pipe, p->handle, p->resident);
2385 return call_size(tc_make_texture_handle_resident);
2386 }
2387
2388 static void
2389 tc_make_texture_handle_resident(struct pipe_context *_pipe, uint64_t handle,
2390 bool resident)
2391 {
2392 struct threaded_context *tc = threaded_context(_pipe);
2393 struct tc_make_texture_handle_resident *p =
2394 tc_add_call(tc, TC_CALL_make_texture_handle_resident,
2395 tc_make_texture_handle_resident);
2396
2397 p->handle = handle;
2398 p->resident = resident;
2399 }
2400
2401 static uint64_t
2402 tc_create_image_handle(struct pipe_context *_pipe,
2403 const struct pipe_image_view *image)
2404 {
2405 struct threaded_context *tc = threaded_context(_pipe);
2406 struct pipe_context *pipe = tc->pipe;
2407 struct pipe_resource *resource = image->resource;
2408
2409 if (image->access & PIPE_IMAGE_ACCESS_WRITE &&
2410 resource && resource->target == PIPE_BUFFER) {
2411 struct threaded_resource *tres = threaded_resource(resource);
2412
2413 /* The CPU storage doesn't support writable buffers. */
2414 tc_buffer_disable_cpu_storage(resource);
2415
2416 util_range_add(&tres->b, &tres->valid_buffer_range,
2417 image->u.buf.offset,
2418 image->u.buf.offset + image->u.buf.size);
2419 }
2420
2421 tc_sync(tc);
2422 return pipe->create_image_handle(pipe, image);
2423 }
2424
2425 struct tc_make_image_handle_resident {
2426 struct tc_call_base base;
2427 bool resident;
2428 unsigned access;
2429 uint64_t handle;
2430 };
2431
2432 static uint16_t
2433 tc_call_make_image_handle_resident(struct pipe_context *pipe, void *call)
2434 {
2435 struct tc_make_image_handle_resident *p =
2436 to_call(call, tc_make_image_handle_resident);
2437
2438 pipe->make_image_handle_resident(pipe, p->handle, p->access, p->resident);
2439 return call_size(tc_make_image_handle_resident);
2440 }
2441
2442 static void
2443 tc_make_image_handle_resident(struct pipe_context *_pipe, uint64_t handle,
2444 unsigned access, bool resident)
2445 {
2446 struct threaded_context *tc = threaded_context(_pipe);
2447 struct tc_make_image_handle_resident *p =
2448 tc_add_call(tc, TC_CALL_make_image_handle_resident,
2449 tc_make_image_handle_resident);
2450
2451 p->handle = handle;
2452 p->access = access;
2453 p->resident = resident;
2454 }
2455
2456
2457 /********************************************************************
2458 * transfer
2459 */
2460
2461 static void
2462 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
2463 unsigned flags);
2464
2465 struct tc_replace_buffer_storage {
2466 struct tc_call_base base;
2467 uint16_t num_rebinds;
2468 uint32_t rebind_mask;
2469 uint32_t delete_buffer_id;
2470 struct pipe_resource *dst;
2471 struct pipe_resource *src;
2472 tc_replace_buffer_storage_func func;
2473 };
2474
2475 static uint16_t
2476 tc_call_replace_buffer_storage(struct pipe_context *pipe, void *call)
2477 {
2478 struct tc_replace_buffer_storage *p = to_call(call, tc_replace_buffer_storage);
2479
2480 p->func(pipe, p->dst, p->src, p->num_rebinds, p->rebind_mask, p->delete_buffer_id);
2481
2482 tc_drop_resource_reference(p->dst);
2483 tc_drop_resource_reference(p->src);
2484 return call_size(tc_replace_buffer_storage);
2485 }
2486
2487 /* Return true if the buffer has been invalidated or is idle. */
2488 static bool
2489 tc_invalidate_buffer(struct threaded_context *tc,
2490 struct threaded_resource *tbuf)
2491 {
2492 if (!tc_is_buffer_busy(tc, tbuf, PIPE_MAP_READ_WRITE)) {
2493 /* It's idle, so invalidation would be a no-op; we can still clear the
2494 * valid range because we are technically doing an invalidation and just
2495 * skipping the useless reallocation.
2496 *
2497 * If the buffer is bound for write, we can't clear the valid range.
2498 */
2499 if (!tc_is_buffer_bound_for_write(tc, tbuf->buffer_id_unique))
2500 util_range_set_empty(&tbuf->valid_buffer_range);
2501 return true;
2502 }
2503
2504 struct pipe_screen *screen = tc->base.screen;
2505 struct pipe_resource *new_buf;
2506
2507 /* Shared, pinned, and sparse buffers can't be reallocated. */
2508 if (tbuf->is_shared ||
2509 tbuf->is_user_ptr ||
2510 tbuf->b.flags & (PIPE_RESOURCE_FLAG_SPARSE | PIPE_RESOURCE_FLAG_UNMAPPABLE))
2511 return false;
2512
2513 assert(tbuf->b.target == PIPE_BUFFER);
2514 tc->bytes_replaced_estimate += tbuf->b.width0;
2515
2516 if (tc->bytes_replaced_limit && (tc->bytes_replaced_estimate > tc->bytes_replaced_limit)) {
2517 tc_flush(&tc->base, NULL, PIPE_FLUSH_ASYNC);
2518 }
2519
2520 /* Allocate a new one. */
2521 new_buf = screen->resource_create(screen, &tbuf->b);
2522 if (!new_buf)
2523 return false;
2524
2525 /* Replace the "latest" pointer. */
2526 if (tbuf->latest != &tbuf->b)
2527 pipe_resource_reference(&tbuf->latest, NULL);
2528
2529 tbuf->latest = new_buf;
2530
2531 uint32_t delete_buffer_id = tbuf->buffer_id_unique;
2532
2533 /* Enqueue storage replacement of the original buffer. */
2534 struct tc_replace_buffer_storage *p =
2535 tc_add_call(tc, TC_CALL_replace_buffer_storage,
2536 tc_replace_buffer_storage);
2537
2538 p->func = tc->replace_buffer_storage;
2539 tc_set_resource_reference(&p->dst, &tbuf->b);
2540 tc_set_resource_reference(&p->src, new_buf);
2541 p->delete_buffer_id = delete_buffer_id;
2542 p->rebind_mask = 0;
2543
2544 /* Treat the current buffer as the new buffer. */
2545 bool bound_for_write = tc_is_buffer_bound_for_write(tc, tbuf->buffer_id_unique);
2546 p->num_rebinds = tc_rebind_buffer(tc, tbuf->buffer_id_unique,
2547 threaded_resource(new_buf)->buffer_id_unique,
2548 &p->rebind_mask);
2549
2550 /* If the buffer is not bound for write, clear the valid range. */
2551 if (!bound_for_write)
2552 util_range_set_empty(&tbuf->valid_buffer_range);
2553
2554 tbuf->buffer_id_unique = threaded_resource(new_buf)->buffer_id_unique;
2555 threaded_resource(new_buf)->buffer_id_unique = 0;
2556
2557 return true;
2558 }
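/* Flow summary of tc_invalidate_buffer (derived from the code above, no
 * additional behavior):
 *   - idle buffer: optionally clear the valid range and return true
 *   - shared / user-ptr / sparse / unmappable: return false (caller syncs)
 *   - otherwise: create fresh storage, point tbuf->latest at it, queue
 *     TC_CALL_replace_buffer_storage so the driver swaps storage later, and
 *     rebind the buffer id so subsequent binds use the new storage.
 */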
2559
2560 static unsigned
2561 tc_improve_map_buffer_flags(struct threaded_context *tc,
2562 struct threaded_resource *tres, unsigned usage,
2563 unsigned offset, unsigned size)
2564 {
2565 /* Never invalidate inside the driver and never infer "unsynchronized". */
2566 unsigned tc_flags = TC_TRANSFER_MAP_NO_INVALIDATE |
2567 TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED;
2568
2569 /* Prevent a reentry. */
2570 if (usage & tc_flags)
2571 return usage;
2572
2573 /* Use the staging upload if it's preferred. */
2574 if (usage & (PIPE_MAP_DISCARD_RANGE |
2575 PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
2576 !(usage & PIPE_MAP_PERSISTENT) &&
2577 tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY &&
2578 tc->use_forced_staging_uploads) {
2579 usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE |
2580 PIPE_MAP_UNSYNCHRONIZED);
2581
2582 return usage | tc_flags | PIPE_MAP_DISCARD_RANGE;
2583 }
2584
2585 /* Sparse buffers can't be mapped directly and can't be reallocated
2586 * (fully invalidated). That may just be a radeonsi limitation, but
2587 * the threaded context must obey it with radeonsi.
2588 */
2589 if (tres->b.flags & (PIPE_RESOURCE_FLAG_SPARSE | PIPE_RESOURCE_FLAG_UNMAPPABLE)) {
2590 /* We can use DISCARD_RANGE instead of full discard. This is the only
2591 * fast path for sparse buffers that doesn't need thread synchronization.
2592 */
2593 if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE)
2594 usage |= PIPE_MAP_DISCARD_RANGE;
2595
2596 /* Allow DISCARD_WHOLE_RESOURCE and inferring UNSYNCHRONIZED in drivers.
2597 * The threaded context doesn't do unsynchronized mappings and
2598 * invalidations of sparse buffers, so correct driver behavior won't
2599 * result in incorrect behavior with the threaded context.
2600 */
2601 return usage;
2602 }
2603
2604 usage |= tc_flags;
2605
2606 /* Handle CPU reads trivially. */
2607 if (usage & PIPE_MAP_READ) {
2608 if (usage & PIPE_MAP_UNSYNCHRONIZED)
2609 usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* don't sync */
2610
2611 /* Drivers aren't allowed to do buffer invalidations. */
2612 return usage & ~PIPE_MAP_DISCARD_WHOLE_RESOURCE;
2613 }
2614
2615 /* See if the buffer range being mapped has never been initialized or
2616 * the buffer is idle, in which case it can be mapped unsynchronized. */
2617 if (!(usage & PIPE_MAP_UNSYNCHRONIZED) &&
2618 ((!tres->is_shared &&
2619 !util_ranges_intersect(&tres->valid_buffer_range, offset, offset + size)) ||
2620 !tc_is_buffer_busy(tc, tres, usage)))
2621 usage |= PIPE_MAP_UNSYNCHRONIZED;
2622
2623 if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
2624 /* If discarding the entire valid range, discard the whole resource instead. */
2625 if (usage & PIPE_MAP_DISCARD_RANGE &&
2626 util_ranges_covered(&tres->valid_buffer_range, offset, offset + size))
2627 usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
2628
2629 /* Discard the whole resource if needed. */
2630 if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) {
2631 if (tc_invalidate_buffer(tc, tres))
2632 usage |= PIPE_MAP_UNSYNCHRONIZED;
2633 else
2634 usage |= PIPE_MAP_DISCARD_RANGE; /* fallback */
2635 }
2636 }
2637
2638 /* We won't need this flag anymore. */
2639 /* TODO: We might not need TC_TRANSFER_MAP_NO_INVALIDATE with this. */
2640 usage &= ~PIPE_MAP_DISCARD_WHOLE_RESOURCE;
2641
2642 /* GL_AMD_pinned_memory and persistent mappings can't use staging
2643 * buffers. */
2644 if (usage & (PIPE_MAP_UNSYNCHRONIZED |
2645 PIPE_MAP_PERSISTENT) ||
2646 tres->is_user_ptr)
2647 usage &= ~PIPE_MAP_DISCARD_RANGE;
2648
2649 /* Unsynchronized buffer mappings don't have to synchronize the thread. */
2650 if (usage & PIPE_MAP_UNSYNCHRONIZED) {
2651 usage &= ~PIPE_MAP_DISCARD_RANGE;
2652 usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* notify the driver */
2653 }
2654
2655 return usage;
2656 }
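/* Worked example (hypothetical): a PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE
 * map of a range that doesn't intersect valid_buffer_range (or of an idle
 * buffer) is promoted to PIPE_MAP_UNSYNCHRONIZED above; the DISCARD flags
 * are then dropped and TC_TRANSFER_MAP_THREADED_UNSYNC is added, so
 * tc_buffer_map can call the driver without syncing the worker thread.
 * If the range is valid and the buffer is busy, DISCARD_RANGE survives and
 * tc_buffer_map takes the staging-upload path instead.
 */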
2657
2658 static void *
2659 tc_buffer_map(struct pipe_context *_pipe,
2660 struct pipe_resource *resource, unsigned level,
2661 unsigned usage, const struct pipe_box *box,
2662 struct pipe_transfer **transfer)
2663 {
2664 struct threaded_context *tc = threaded_context(_pipe);
2665 struct threaded_resource *tres = threaded_resource(resource);
2666 struct pipe_context *pipe = tc->pipe;
2667
2668 /* PIPE_MAP_THREAD_SAFE is for glthread, which shouldn't use the CPU storage;
2669 * this shouldn't normally be necessary because glthread only uses large buffers.
2670 */
2671 if (usage & PIPE_MAP_THREAD_SAFE)
2672 tc_buffer_disable_cpu_storage(resource);
2673
2674 usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width);
2675
2676 /* If the CPU storage is enabled, return it directly. */
2677 if (tres->allow_cpu_storage && !(usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
2678 /* We can't let resource_copy_region disable the CPU storage. */
2679 assert(!(tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY));
2680
2681 if (!tres->cpu_storage) {
2682 tres->cpu_storage = align_malloc(resource->width0, tc->map_buffer_alignment);
2683
2684 if (tres->cpu_storage && tres->valid_buffer_range.end) {
2685 /* The GPU buffer contains valid data. Copy them to the CPU storage. */
2686 struct pipe_box box2;
2687 struct pipe_transfer *transfer2;
2688
2689 unsigned valid_range_len = tres->valid_buffer_range.end - tres->valid_buffer_range.start;
2690 u_box_1d(tres->valid_buffer_range.start, valid_range_len, &box2);
2691
2692 tc_sync_msg(tc, "cpu storage GPU -> CPU copy");
2693 tc_set_driver_thread(tc);
2694
2695 void *ret = pipe->buffer_map(pipe, tres->latest ? tres->latest : resource,
2696 0, PIPE_MAP_READ, &box2, &transfer2);
2697 memcpy(&((uint8_t*)tres->cpu_storage)[tres->valid_buffer_range.start],
2698 ret,
2699 valid_range_len);
2700 pipe->buffer_unmap(pipe, transfer2);
2701
2702 tc_clear_driver_thread(tc);
2703 }
2704 }
2705
2706 if (tres->cpu_storage) {
2707 struct threaded_transfer *ttrans = slab_zalloc(&tc->pool_transfers);
2708 ttrans->b.resource = resource;
2709 ttrans->b.usage = usage;
2710 ttrans->b.box = *box;
2711 ttrans->valid_buffer_range = &tres->valid_buffer_range;
2712 ttrans->cpu_storage_mapped = true;
2713 *transfer = &ttrans->b;
2714
2715 return (uint8_t*)tres->cpu_storage + box->x;
2716 } else {
2717 tres->allow_cpu_storage = false;
2718 }
2719 }
2720
2721 /* Do a staging transfer within the threaded context. The driver should
2722 * only get resource_copy_region.
2723 */
2724 if (usage & PIPE_MAP_DISCARD_RANGE) {
2725 struct threaded_transfer *ttrans = slab_zalloc(&tc->pool_transfers);
2726 uint8_t *map;
2727
2728 u_upload_alloc(tc->base.stream_uploader, 0,
2729 box->width + (box->x % tc->map_buffer_alignment),
2730 tc->map_buffer_alignment, &ttrans->b.offset,
2731 &ttrans->staging, (void**)&map);
2732 if (!map) {
2733 slab_free(&tc->pool_transfers, ttrans);
2734 return NULL;
2735 }
2736
2737 ttrans->b.resource = resource;
2738 ttrans->b.level = 0;
2739 ttrans->b.usage = usage;
2740 ttrans->b.box = *box;
2741 ttrans->b.stride = 0;
2742 ttrans->b.layer_stride = 0;
2743 ttrans->valid_buffer_range = &tres->valid_buffer_range;
2744 ttrans->cpu_storage_mapped = false;
2745 *transfer = &ttrans->b;
2746
2747 p_atomic_inc(&tres->pending_staging_uploads);
2748 util_range_add(resource, &tres->pending_staging_uploads_range,
2749 box->x, box->x + box->width);
2750
2751 return map + (box->x % tc->map_buffer_alignment);
2752 }
2753
2754 if (usage & PIPE_MAP_UNSYNCHRONIZED &&
2755 p_atomic_read(&tres->pending_staging_uploads) &&
2756 util_ranges_intersect(&tres->pending_staging_uploads_range, box->x, box->x + box->width)) {
2757 /* Write conflict detected between a staging transfer and the direct mapping we're
2758 * going to do. Resolve the conflict by ignoring UNSYNCHRONIZED so the direct mapping
2759 * will have to wait for the staging transfer completion.
2760 * Note: The conflict detection is only based on the mapped range, not on the actual
2761 * written range(s).
2762 */
2763 usage &= ~PIPE_MAP_UNSYNCHRONIZED & ~TC_TRANSFER_MAP_THREADED_UNSYNC;
2764 tc->use_forced_staging_uploads = false;
2765 }
2766
2767 /* Unsynchronized buffer mappings don't have to synchronize the thread. */
2768 if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)) {
2769 tc_sync_msg(tc, usage & PIPE_MAP_DISCARD_RANGE ? " discard_range" :
2770 usage & PIPE_MAP_READ ? " read" : " staging conflict");
2771 tc_set_driver_thread(tc);
2772 }
2773
2774 tc->bytes_mapped_estimate += box->width;
2775
2776 void *ret = pipe->buffer_map(pipe, tres->latest ? tres->latest : resource,
2777 level, usage, box, transfer);
2778 threaded_transfer(*transfer)->valid_buffer_range = &tres->valid_buffer_range;
2779 threaded_transfer(*transfer)->cpu_storage_mapped = false;
2780
2781 if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
2782 tc_clear_driver_thread(tc);
2783
2784 return ret;
2785 }
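/* Summary of the three mapping paths above (a reading aid, not new
 * behavior): (1) CPU storage -- resources with allow_cpu_storage return a
 * malloc'ed shadow copy, uploaded back on unmap; (2) DISCARD_RANGE -- a
 * staging allocation from stream_uploader is returned and later copied into
 * place via tc_buffer_do_flush_region/resource_copy_region; (3) direct
 * driver map -- everything else, which syncs the worker thread unless the
 * mapping was promoted to unsynchronized by tc_improve_map_buffer_flags.
 */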
2786
2787 static void *
2788 tc_texture_map(struct pipe_context *_pipe,
2789 struct pipe_resource *resource, unsigned level,
2790 unsigned usage, const struct pipe_box *box,
2791 struct pipe_transfer **transfer)
2792 {
2793 struct threaded_context *tc = threaded_context(_pipe);
2794 struct threaded_resource *tres = threaded_resource(resource);
2795 struct pipe_context *pipe = tc->pipe;
2796
2797 tc_sync_msg(tc, "texture");
2798 tc_set_driver_thread(tc);
2799 /* block all unsync texture subdata during map */
2800 tc_set_resource_batch_usage_persistent(tc, resource, true);
2801
2802 tc->bytes_mapped_estimate += box->width;
2803
2804 void *ret = pipe->texture_map(pipe, tres->latest ? tres->latest : resource,
2805 level, usage, box, transfer);
2806
2807 if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
2808 tc_clear_driver_thread(tc);
2809
2810 return ret;
2811 }
2812
2813 struct tc_transfer_flush_region {
2814 struct tc_call_base base;
2815 struct pipe_box box;
2816 struct pipe_transfer *transfer;
2817 };
2818
2819 static uint16_t
2820 tc_call_transfer_flush_region(struct pipe_context *pipe, void *call)
2821 {
2822 struct tc_transfer_flush_region *p = to_call(call, tc_transfer_flush_region);
2823
2824 pipe->transfer_flush_region(pipe, p->transfer, &p->box);
2825 return call_size(tc_transfer_flush_region);
2826 }
2827
2828 struct tc_resource_copy_region {
2829 struct tc_call_base base;
2830 unsigned dst_level;
2831 unsigned dstx, dsty, dstz;
2832 unsigned src_level;
2833 struct pipe_box src_box;
2834 struct pipe_resource *dst;
2835 struct pipe_resource *src;
2836 };
2837
2838 static void
2839 tc_resource_copy_region(struct pipe_context *_pipe,
2840 struct pipe_resource *dst, unsigned dst_level,
2841 unsigned dstx, unsigned dsty, unsigned dstz,
2842 struct pipe_resource *src, unsigned src_level,
2843 const struct pipe_box *src_box);
2844
2845 static void
2846 tc_buffer_do_flush_region(struct threaded_context *tc,
2847 struct threaded_transfer *ttrans,
2848 const struct pipe_box *box)
2849 {
2850 struct threaded_resource *tres = threaded_resource(ttrans->b.resource);
2851
2852 if (ttrans->staging) {
2853 struct pipe_box src_box;
2854
2855 u_box_1d(ttrans->b.offset + ttrans->b.box.x % tc->map_buffer_alignment +
2856 (box->x - ttrans->b.box.x),
2857 box->width, &src_box);
2858
2859 /* Copy the staging buffer into the original one. */
2860 tc_resource_copy_region(&tc->base, ttrans->b.resource, 0, box->x, 0, 0,
2861 ttrans->staging, 0, &src_box);
2862 }
2863
2864 /* Don't update the valid range when we're uploading the CPU storage
2865 * because it includes the uninitialized range too.
2866 */
2867 if (!(ttrans->b.usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
2868 util_range_add(&tres->b, ttrans->valid_buffer_range,
2869 box->x, box->x + box->width);
2870 }
2871 }
2872
2873 static void
2874 tc_transfer_flush_region(struct pipe_context *_pipe,
2875 struct pipe_transfer *transfer,
2876 const struct pipe_box *rel_box)
2877 {
2878 struct threaded_context *tc = threaded_context(_pipe);
2879 struct threaded_transfer *ttrans = threaded_transfer(transfer);
2880 struct threaded_resource *tres = threaded_resource(transfer->resource);
2881 unsigned required_usage = PIPE_MAP_WRITE |
2882 PIPE_MAP_FLUSH_EXPLICIT;
2883
2884 if (tres->b.target == PIPE_BUFFER) {
2885 if ((transfer->usage & required_usage) == required_usage) {
2886 struct pipe_box box;
2887
2888 u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
2889 tc_buffer_do_flush_region(tc, ttrans, &box);
2890 }
2891
2892 /* Staging transfers don't send the call to the driver.
2893 *
2894 * Transfers using the CPU storage shouldn't call transfer_flush_region
2895 * in the driver because the buffer is not really mapped on the driver
2896 * side and the CPU storage always re-uploads everything (flush_region
2897 * makes no difference).
2898 */
2899 if (ttrans->staging || ttrans->cpu_storage_mapped)
2900 return;
2901 }
2902
2903 struct tc_transfer_flush_region *p =
2904 tc_add_call(tc, TC_CALL_transfer_flush_region, tc_transfer_flush_region);
2905 p->transfer = transfer;
2906 p->box = *rel_box;
2907 }
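/* Usage sketch (hypothetical caller) of the explicit-flush path handled
 * above: with PIPE_MAP_WRITE | PIPE_MAP_FLUSH_EXPLICIT only the flushed
 * sub-ranges are guaranteed to reach the driver:
 *
 *    uint8_t *ptr = ctx->buffer_map(ctx, buf, 0,
 *                                   PIPE_MAP_WRITE | PIPE_MAP_FLUSH_EXPLICIT,
 *                                   &box, &xfer);
 *    memcpy(ptr + 16, data, 32);
 *    u_box_1d(16, 32, &dirty);            (range relative to the mapped box)
 *    ctx->transfer_flush_region(ctx, xfer, &dirty);
 *    ctx->buffer_unmap(ctx, xfer);
 */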
2908
2909 struct tc_buffer_unmap {
2910 struct tc_call_base base;
2911 bool was_staging_transfer;
2912 union {
2913 struct pipe_transfer *transfer;
2914 struct pipe_resource *resource;
2915 };
2916 };
2917
2918 static uint16_t
2919 tc_call_buffer_unmap(struct pipe_context *pipe, void *call)
2920 {
2921 struct tc_buffer_unmap *p = to_call(call, tc_buffer_unmap);
2922
2923 if (p->was_staging_transfer) {
2924 struct threaded_resource *tres = threaded_resource(p->resource);
2925 /* Nothing to do except keeping track of staging uploads */
2926 assert(tres->pending_staging_uploads > 0);
2927 p_atomic_dec(&tres->pending_staging_uploads);
2928 tc_drop_resource_reference(p->resource);
2929 } else {
2930 pipe->buffer_unmap(pipe, p->transfer);
2931 }
2932
2933 return call_size(tc_buffer_unmap);
2934 }
2935
2936 static void
2937 tc_buffer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
2938 {
2939 struct threaded_context *tc = threaded_context(_pipe);
2940 struct threaded_transfer *ttrans = threaded_transfer(transfer);
2941 struct threaded_resource *tres = threaded_resource(transfer->resource);
2942
2943 /* PIPE_MAP_THREAD_SAFE is only valid with UNSYNCHRONIZED. It can be
2944 * called from any thread and bypasses all multithreaded queues.
2945 */
2946 if (transfer->usage & PIPE_MAP_THREAD_SAFE) {
2947 assert(transfer->usage & PIPE_MAP_UNSYNCHRONIZED);
2948 assert(!(transfer->usage & (PIPE_MAP_FLUSH_EXPLICIT |
2949 PIPE_MAP_DISCARD_RANGE)));
2950
2951 struct pipe_context *pipe = tc->pipe;
2952 util_range_add(&tres->b, ttrans->valid_buffer_range,
2953 transfer->box.x, transfer->box.x + transfer->box.width);
2954
2955 pipe->buffer_unmap(pipe, transfer);
2956 return;
2957 }
2958
2959 if (transfer->usage & PIPE_MAP_WRITE &&
2960 !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT))
2961 tc_buffer_do_flush_region(tc, ttrans, &transfer->box);
2962
2963 if (ttrans->cpu_storage_mapped) {
2964 /* GL allows simultaneous GPU stores with mapped buffers as long as GPU stores don't
2965 * touch the mapped range. That's a problem because GPU stores free the CPU storage.
2966 * If that happens, we just ignore the unmap call and don't upload anything to prevent
2967 * a crash.
2968 *
2969 * Disallow the CPU storage in the driver to work around this.
2970 */
2971 assert(tres->cpu_storage);
2972
2973 if (tres->cpu_storage) {
2974 tc_invalidate_buffer(tc, tres);
2975 tc_buffer_subdata(&tc->base, &tres->b,
2976 PIPE_MAP_UNSYNCHRONIZED |
2977 TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE,
2978 0, tres->b.width0, tres->cpu_storage);
2979 /* This shouldn't have been freed by buffer_subdata. */
2980 assert(tres->cpu_storage);
2981 } else {
2982 static bool warned_once = false;
2983 if (!warned_once) {
2984 fprintf(stderr, "This application is incompatible with cpu_storage.\n");
2985 fprintf(stderr, "Use tc_max_cpu_storage_size=0 to disable it and report this issue to Mesa.\n");
2986 warned_once = true;
2987 }
2988 }
2989
2990 tc_drop_resource_reference(ttrans->staging);
2991 slab_free(&tc->pool_transfers, ttrans);
2992 return;
2993 }
2994
2995 bool was_staging_transfer = false;
2996
2997 if (ttrans->staging) {
2998 was_staging_transfer = true;
2999
3000 tc_drop_resource_reference(ttrans->staging);
3001 slab_free(&tc->pool_transfers, ttrans);
3002 }
3003
3004 struct tc_buffer_unmap *p = tc_add_call(tc, TC_CALL_buffer_unmap,
3005 tc_buffer_unmap);
3006 if (was_staging_transfer) {
3007 tc_set_resource_reference(&p->resource, &tres->b);
3008 p->was_staging_transfer = true;
3009 } else {
3010 p->transfer = transfer;
3011 p->was_staging_transfer = false;
3012 }
3013
3014 /* tc_buffer_map directly maps the buffers, but tc_buffer_unmap
3015 * defers the unmap operation to the batch execution.
3016 * bytes_mapped_estimate is an estimate of the map/unmap byte delta;
3017 * if it exceeds an optional limit, the current batch is flushed
3018 * to reclaim some RAM. */
3019 if (!ttrans->staging && tc->bytes_mapped_limit &&
3020 tc->bytes_mapped_estimate > tc->bytes_mapped_limit) {
3021 tc_flush(_pipe, NULL, PIPE_FLUSH_ASYNC);
3022 }
3023 }
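/* Illustrative sketch (not driver code) of the map/unmap accounting used
 * above, assuming tc_buffer_map grows bytes_mapped_estimate when it maps
 * directly (that part lives outside this function):
 *
 *    map 64 MB  ->  bytes_mapped_estimate += 64 MB
 *    unmap      ->  queued; the memory stays mapped until batch execution
 *    if (tc->bytes_mapped_estimate > tc->bytes_mapped_limit)
 *       tc_flush(_pipe, NULL, PIPE_FLUSH_ASYNC);  // kick the queue so the
 *                                                 // driver can unmap sooner
 */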
3024
3025 struct tc_texture_unmap {
3026 struct tc_call_base base;
3027 struct pipe_transfer *transfer;
3028 };
3029
3030 static uint16_t
3031 tc_call_texture_unmap(struct pipe_context *pipe, void *call)
3032 {
3033 struct tc_texture_unmap *p = (struct tc_texture_unmap *) call;
3034
3035 pipe->texture_unmap(pipe, p->transfer);
3036 return call_size(tc_texture_unmap);
3037 }
3038
3039 static void
3040 tc_texture_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
3041 {
3042 struct threaded_context *tc = threaded_context(_pipe);
3043 struct threaded_transfer *ttrans = threaded_transfer(transfer);
3044
3045 /* enable subdata again once resource is no longer mapped */
3046 tc_set_resource_batch_usage_persistent(tc, transfer->resource, false);
3047
3048 tc_add_call(tc, TC_CALL_texture_unmap, tc_texture_unmap)->transfer = transfer;
3049
3050 /* tc_texture_map directly maps the textures, but tc_texture_unmap
3051 * defers the unmap operation to the batch execution.
3052     * bytes_mapped_estimate estimates the delta of mapped/unmapped bytes;
3053     * if it exceeds the optional limit, the current batch is flushed
3054     * to reclaim some RAM. */
3055 if (!ttrans->staging && tc->bytes_mapped_limit &&
3056 tc->bytes_mapped_estimate > tc->bytes_mapped_limit) {
3057 tc_flush(_pipe, NULL, PIPE_FLUSH_ASYNC);
3058 }
3059 }
3060
3061 struct tc_buffer_subdata {
3062 struct tc_call_base base;
3063 unsigned usage, offset, size;
3064 struct pipe_resource *resource;
3065 char slot[0]; /* more will be allocated if needed */
3066 };
3067
3068 static uint16_t
3069 tc_call_buffer_subdata(struct pipe_context *pipe, void *call)
3070 {
3071 struct tc_buffer_subdata *p = (struct tc_buffer_subdata *)call;
3072
3073 pipe->buffer_subdata(pipe, p->resource, p->usage, p->offset, p->size,
3074 p->slot);
3075 tc_drop_resource_reference(p->resource);
3076 return p->base.num_slots;
3077 }
3078
3079 static bool
3080 is_mergeable_buffer_subdata(const struct tc_call_base *previous_call,
3081 unsigned usage, unsigned offset,
3082 struct pipe_resource *resource)
3083 {
3084 if (!previous_call || previous_call->call_id != TC_CALL_buffer_subdata)
3085 return false;
3086
3087 struct tc_buffer_subdata *subdata = (struct tc_buffer_subdata *)previous_call;
3088
3089 return subdata->usage == usage && subdata->resource == resource
3090 && (subdata->offset + subdata->size) == offset;
3091 }
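/* Example (illustrative): two consecutive, contiguous updates such as
 *
 *    tc_buffer_subdata(ctx, buf, usage, 0,   256, data_a);
 *    tc_buffer_subdata(ctx, buf, usage, 256, 128, data_b);
 *
 * pass the check above (same usage, same resource, offset equal to the end
 * of the previous write), so the second upload is appended to the first
 * call's slot storage instead of enqueuing a new call.
 */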
3092
3093 static void
3094 tc_buffer_subdata(struct pipe_context *_pipe,
3095 struct pipe_resource *resource,
3096 unsigned usage, unsigned offset,
3097 unsigned size, const void *data)
3098 {
3099 struct threaded_context *tc = threaded_context(_pipe);
3100 struct threaded_resource *tres = threaded_resource(resource);
3101
3102 if (!size)
3103 return;
3104
3105 usage |= PIPE_MAP_WRITE;
3106
3107    /* PIPE_MAP_DIRECTLY suppresses the implicit DISCARD_RANGE. */
3108 if (!(usage & PIPE_MAP_DIRECTLY))
3109 usage |= PIPE_MAP_DISCARD_RANGE;
3110
3111 usage = tc_improve_map_buffer_flags(tc, tres, usage, offset, size);
3112
3113    /* Unsynchronized and big transfers should use transfer_map. Also handle
3114 * full invalidations, because drivers aren't allowed to do them.
3115 */
3116 if (usage & (PIPE_MAP_UNSYNCHRONIZED |
3117 PIPE_MAP_DISCARD_WHOLE_RESOURCE) ||
3118 size > TC_MAX_SUBDATA_BYTES ||
3119 tres->cpu_storage) {
3120 struct pipe_transfer *transfer;
3121 struct pipe_box box;
3122 uint8_t *map = NULL;
3123
3124 u_box_1d(offset, size, &box);
3125
3126 /* CPU storage is only useful for partial updates. It can add overhead
3127        * on glBufferData calls, so avoid using it there.
3128 */
3129 if (!tres->cpu_storage && offset == 0 && size == resource->width0)
3130 usage |= TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE;
3131
3132 map = tc_buffer_map(_pipe, resource, 0, usage, &box, &transfer);
3133 if (map) {
3134 memcpy(map, data, size);
3135 tc_buffer_unmap(_pipe, transfer);
3136 }
3137 return;
3138 }
3139
3140 util_range_add(&tres->b, &tres->valid_buffer_range, offset, offset + size);
3141
3142 /* We can potentially merge this subdata call with the previous one (if any),
3143 * if the application does a whole-buffer upload piecewise. */
3144 {
3145 struct tc_call_base *last_call = tc_get_last_mergeable_call(tc);
3146 struct tc_buffer_subdata *merge_dest = (struct tc_buffer_subdata *)last_call;
3147
3148 if (is_mergeable_buffer_subdata(last_call, usage, offset, resource) &&
3149 tc_enlarge_last_mergeable_call(tc, call_size_with_slots(tc_buffer_subdata, merge_dest->size + size))) {
3150 memcpy(merge_dest->slot + merge_dest->size, data, size);
3151 merge_dest->size += size;
3152
3153 /* TODO: We *could* do an invalidate + upload here if we detect that
3154 * the merged subdata call overwrites the entire buffer. However, that's
3155 * a little complicated since we can't add further calls to our batch
3156 * until we have removed the merged subdata call, which means that
3157 * calling tc_invalidate_buffer before we have removed the call will
3158 * blow things up.
3159 *
3160 * Just leave a large, merged subdata call in the batch for now, which is
3161 * at least better than tons of tiny subdata calls.
3162 */
3163
3164 return;
3165 }
3166 }
3167
3168 /* The upload is small. Enqueue it. */
3169 struct tc_buffer_subdata *p =
3170 tc_add_slot_based_call(tc, TC_CALL_buffer_subdata, tc_buffer_subdata, size);
3171
3172 tc_set_resource_reference(&p->resource, resource);
3173    /* This will always be busy, because if it weren't, tc_improve_map_buffer_flags
3174     * would have set UNSYNCHRONIZED and we wouldn't get here.
3175     */
3176 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], resource);
3177 p->usage = usage;
3178 p->offset = offset;
3179 p->size = size;
3180 memcpy(p->slot, data, size);
3181
3182 tc_mark_call_mergeable(tc, &p->base);
3183 }
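/* Routing summary for the function above (illustrative, assuming no CPU
 * storage is attached to the buffer):
 *
 *    - small synchronized write                   -> queued tc_buffer_subdata
 *    - write contiguous with the previous call    -> merged into that call
 *    - unsynchronized, whole-resource discard, or
 *      size > TC_MAX_SUBDATA_BYTES                -> tc_buffer_map + memcpy
 *                                                    + tc_buffer_unmap
 */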
3184
3185 struct tc_texture_subdata {
3186 struct tc_call_base base;
3187 unsigned level, usage, stride;
3188 struct pipe_box box;
3189 struct pipe_resource *resource;
3190 uintptr_t layer_stride;
3191 char slot[0]; /* more will be allocated if needed */
3192 };
3193
3194 static uint16_t
3195 tc_call_texture_subdata(struct pipe_context *pipe, void *call)
3196 {
3197 struct tc_texture_subdata *p = (struct tc_texture_subdata *)call;
3198
3199 pipe->texture_subdata(pipe, p->resource, p->level, p->usage, &p->box,
3200 p->slot, p->stride, p->layer_stride);
3201 tc_drop_resource_reference(p->resource);
3202 return p->base.num_slots;
3203 }
3204
3205 static void
3206 tc_texture_subdata(struct pipe_context *_pipe,
3207 struct pipe_resource *resource,
3208 unsigned level, unsigned usage,
3209 const struct pipe_box *box,
3210 const void *data, unsigned stride,
3211 uintptr_t layer_stride)
3212 {
3213 struct threaded_context *tc = threaded_context(_pipe);
3214 uint64_t size;
3215
3216 assert(box->height >= 1);
3217 assert(box->depth >= 1);
3218
3219 size = (box->depth - 1) * layer_stride +
3220 (box->height - 1) * (uint64_t)stride +
3221 box->width * util_format_get_blocksize(resource->format);
3222 if (!size)
3223 return;
3224
3225    /* Small uploads can be enqueued; big uploads must sync. */
3226 if (size <= TC_MAX_SUBDATA_BYTES) {
3227 struct tc_texture_subdata *p =
3228 tc_add_slot_based_call(tc, TC_CALL_texture_subdata, tc_texture_subdata, size);
3229
3230 tc_set_resource_batch_usage(tc, resource);
3231 tc_set_resource_reference(&p->resource, resource);
3232 p->level = level;
3233 p->usage = usage;
3234 p->box = *box;
3235 p->stride = stride;
3236 p->layer_stride = layer_stride;
3237 memcpy(p->slot, data, size);
3238 } else {
3239 struct pipe_context *pipe = tc->pipe;
3240 struct threaded_resource *tres = threaded_resource(resource);
3241 unsigned unsync_usage = TC_TRANSFER_MAP_THREADED_UNSYNC | PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_WRITE;
3242 bool can_unsync = !tc_resource_batch_usage_test_busy(tc, resource) &&
3243 tc->options.is_resource_busy &&
3244 !tc->options.is_resource_busy(tc->pipe->screen, tres->latest, usage | unsync_usage);
3245
3246 if (!can_unsync && resource->usage != PIPE_USAGE_STAGING &&
3247 tc->options.parse_renderpass_info && tc->in_renderpass) {
3248 enum pipe_format format = resource->format;
3249 if (usage & PIPE_MAP_DEPTH_ONLY)
3250 format = util_format_get_depth_only(format);
3251 else if (usage & PIPE_MAP_STENCIL_ONLY)
3252 format = PIPE_FORMAT_S8_UINT;
3253
3254 unsigned fmt_stride = util_format_get_stride(format, box->width);
3255 uint64_t fmt_layer_stride = util_format_get_2d_size(format, stride, box->height);
3256 assert(fmt_layer_stride * box->depth <= UINT32_MAX);
3257
3258 struct pipe_resource *pres = pipe_buffer_create(pipe->screen, 0, PIPE_USAGE_STREAM, layer_stride * box->depth);
3259 pipe->buffer_subdata(pipe, pres, unsync_usage, 0, layer_stride * box->depth, data);
3260 struct pipe_box src_box = *box;
3261 src_box.x = src_box.y = src_box.z = 0;
3262
3263 if (fmt_stride == stride && fmt_layer_stride == layer_stride) {
3264             /* if the stride matches, a single copy is fine */
3265 tc->base.resource_copy_region(&tc->base, resource, level, box->x, box->y, box->z, pres, 0, &src_box);
3266 } else {
3267 /* if stride doesn't match, inline util_copy_box on the GPU and assume the driver will optimize */
3268 src_box.depth = 1;
3269 for (unsigned z = 0; z < box->depth; ++z, src_box.x = z * layer_stride) {
3270 unsigned dst_x = box->x, dst_y = box->y, width = box->width, height = box->height, dst_z = box->z + z;
3271 int blocksize = util_format_get_blocksize(format);
3272 int blockwidth = util_format_get_blockwidth(format);
3273 int blockheight = util_format_get_blockheight(format);
3274
3275 assert(blocksize > 0);
3276 assert(blockwidth > 0);
3277 assert(blockheight > 0);
3278
3279 dst_x /= blockwidth;
3280 dst_y /= blockheight;
3281 width = DIV_ROUND_UP(width, blockwidth);
3282 height = DIV_ROUND_UP(height, blockheight);
3283
3284 width *= blocksize;
3285
3286 if (width == fmt_stride && width == (unsigned)stride) {
3287 ASSERTED uint64_t size = (uint64_t)height * width;
3288
3289 assert(size <= SIZE_MAX);
3290 assert(dst_x + src_box.width < u_minify(pres->width0, level));
3291 assert(dst_y + src_box.height < u_minify(pres->height0, level));
3292 assert(pres->target != PIPE_TEXTURE_3D || z + src_box.depth < u_minify(pres->depth0, level));
3293 tc->base.resource_copy_region(&tc->base, resource, level, dst_x, dst_y, dst_z, pres, 0, &src_box);
3294 } else {
3295 src_box.height = 1;
3296 for (unsigned i = 0; i < height; i++, dst_y++, src_box.x += stride)
3297 tc->base.resource_copy_region(&tc->base, resource, level, dst_x, dst_y, dst_z, pres, 0, &src_box);
3298 }
3299 }
3300 }
3301
3302 pipe_resource_reference(&pres, NULL);
3303 } else {
3304 if (can_unsync) {
3305 usage |= unsync_usage;
3306 } else {
3307 tc_sync(tc);
3308 tc_set_driver_thread(tc);
3309 }
3310 pipe->texture_subdata(pipe, resource, level, usage, box, data,
3311 stride, layer_stride);
3312 if (!can_unsync)
3313 tc_clear_driver_thread(tc);
3314 }
3315 }
3316 }
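/* Worked example for the size computation at the top of tc_texture_subdata
 * (illustrative numbers): uploading a 4x4x1 box of PIPE_FORMAT_R8G8B8A8_UNORM
 * (blocksize 4) with stride = 16 gives
 *
 *    size = (1 - 1) * layer_stride + (4 - 1) * 16 + 4 * 4 = 64 bytes,
 *
 * so as long as that stays within TC_MAX_SUBDATA_BYTES the upload is queued
 * instead of taking one of the synchronizing paths.
 */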
3317
3318
3319 /********************************************************************
3320 * miscellaneous
3321 */
3322
3323 #define TC_FUNC_SYNC_RET0(ret_type, func) \
3324 static ret_type \
3325 tc_##func(struct pipe_context *_pipe) \
3326 { \
3327 struct threaded_context *tc = threaded_context(_pipe); \
3328 struct pipe_context *pipe = tc->pipe; \
3329 tc_sync(tc); \
3330 return pipe->func(pipe); \
3331 }
3332
3333 TC_FUNC_SYNC_RET0(uint64_t, get_timestamp)
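/* For reference, the instantiation above expands to roughly:
 *
 *    static uint64_t
 *    tc_get_timestamp(struct pipe_context *_pipe)
 *    {
 *       struct threaded_context *tc = threaded_context(_pipe);
 *       struct pipe_context *pipe = tc->pipe;
 *       tc_sync(tc);
 *       return pipe->get_timestamp(pipe);
 *    }
 */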
3334
3335 static void
3336 tc_get_sample_position(struct pipe_context *_pipe,
3337 unsigned sample_count, unsigned sample_index,
3338 float *out_value)
3339 {
3340 struct threaded_context *tc = threaded_context(_pipe);
3341 struct pipe_context *pipe = tc->pipe;
3342
3343 pipe->get_sample_position(pipe, sample_count, sample_index,
3344 out_value);
3345 }
3346
3347 static enum pipe_reset_status
3348 tc_get_device_reset_status(struct pipe_context *_pipe)
3349 {
3350 struct threaded_context *tc = threaded_context(_pipe);
3351 struct pipe_context *pipe = tc->pipe;
3352
3353 if (!tc->options.unsynchronized_get_device_reset_status)
3354 tc_sync(tc);
3355
3356 return pipe->get_device_reset_status(pipe);
3357 }
3358
3359 static void
3360 tc_set_device_reset_callback(struct pipe_context *_pipe,
3361 const struct pipe_device_reset_callback *cb)
3362 {
3363 struct threaded_context *tc = threaded_context(_pipe);
3364 struct pipe_context *pipe = tc->pipe;
3365
3366 tc_sync(tc);
3367 pipe->set_device_reset_callback(pipe, cb);
3368 }
3369
3370 struct tc_string_marker {
3371 struct tc_call_base base;
3372 int len;
3373 char slot[0]; /* more will be allocated if needed */
3374 };
3375
3376 static uint16_t
3377 tc_call_emit_string_marker(struct pipe_context *pipe, void *call)
3378 {
3379 struct tc_string_marker *p = (struct tc_string_marker *)call;
3380 pipe->emit_string_marker(pipe, p->slot, p->len);
3381 return p->base.num_slots;
3382 }
3383
3384 static void
3385 tc_emit_string_marker(struct pipe_context *_pipe,
3386 const char *string, int len)
3387 {
3388 struct threaded_context *tc = threaded_context(_pipe);
3389
3390 if (len <= TC_MAX_STRING_MARKER_BYTES) {
3391 struct tc_string_marker *p =
3392 tc_add_slot_based_call(tc, TC_CALL_emit_string_marker, tc_string_marker, len);
3393
3394 memcpy(p->slot, string, len);
3395 p->len = len;
3396 } else {
3397 struct pipe_context *pipe = tc->pipe;
3398
3399 tc_sync(tc);
3400 tc_set_driver_thread(tc);
3401 pipe->emit_string_marker(pipe, string, len);
3402 tc_clear_driver_thread(tc);
3403 }
3404 }
3405
3406 static void
3407 tc_dump_debug_state(struct pipe_context *_pipe, FILE *stream,
3408 unsigned flags)
3409 {
3410 struct threaded_context *tc = threaded_context(_pipe);
3411 struct pipe_context *pipe = tc->pipe;
3412
3413 tc_sync(tc);
3414 pipe->dump_debug_state(pipe, stream, flags);
3415 }
3416
3417 static void
3418 tc_set_debug_callback(struct pipe_context *_pipe,
3419 const struct util_debug_callback *cb)
3420 {
3421 struct threaded_context *tc = threaded_context(_pipe);
3422 struct pipe_context *pipe = tc->pipe;
3423
3424 tc_sync(tc);
3425
3426 /* Drop all synchronous debug callbacks. Drivers are expected to be OK
3427 * with this. shader-db will use an environment variable to disable
3428 * the threaded context.
3429 */
3430 if (cb && !cb->async)
3431 pipe->set_debug_callback(pipe, NULL);
3432 else
3433 pipe->set_debug_callback(pipe, cb);
3434 }
3435
3436 static void
3437 tc_set_log_context(struct pipe_context *_pipe, struct u_log_context *log)
3438 {
3439 struct threaded_context *tc = threaded_context(_pipe);
3440 struct pipe_context *pipe = tc->pipe;
3441
3442 tc_sync(tc);
3443 pipe->set_log_context(pipe, log);
3444 }
3445
3446 static void
3447 tc_create_fence_fd(struct pipe_context *_pipe,
3448 struct pipe_fence_handle **fence, int fd,
3449 enum pipe_fd_type type)
3450 {
3451 struct threaded_context *tc = threaded_context(_pipe);
3452 struct pipe_context *pipe = tc->pipe;
3453
3454 if (!tc->options.unsynchronized_create_fence_fd)
3455 tc_sync(tc);
3456
3457 pipe->create_fence_fd(pipe, fence, fd, type);
3458 }
3459
3460 struct tc_fence_call {
3461 struct tc_call_base base;
3462 struct pipe_fence_handle *fence;
3463 };
3464
3465 static uint16_t
3466 tc_call_fence_server_sync(struct pipe_context *pipe, void *call)
3467 {
3468 struct pipe_fence_handle *fence = to_call(call, tc_fence_call)->fence;
3469
3470 pipe->fence_server_sync(pipe, fence);
3471 pipe->screen->fence_reference(pipe->screen, &fence, NULL);
3472 return call_size(tc_fence_call);
3473 }
3474
3475 static void
3476 tc_fence_server_sync(struct pipe_context *_pipe,
3477 struct pipe_fence_handle *fence)
3478 {
3479 struct threaded_context *tc = threaded_context(_pipe);
3480 struct pipe_screen *screen = tc->pipe->screen;
3481 struct tc_fence_call *call = tc_add_call(tc, TC_CALL_fence_server_sync,
3482 tc_fence_call);
3483
3484 call->fence = NULL;
3485 screen->fence_reference(screen, &call->fence, fence);
3486 }
3487
3488 static void
3489 tc_fence_server_signal(struct pipe_context *_pipe,
3490 struct pipe_fence_handle *fence)
3491 {
3492 struct threaded_context *tc = threaded_context(_pipe);
3493 struct pipe_context *pipe = tc->pipe;
3494 tc_sync(tc);
3495 pipe->fence_server_signal(pipe, fence);
3496 }
3497
3498 static struct pipe_video_codec *
3499 tc_create_video_codec(UNUSED struct pipe_context *_pipe,
3500 UNUSED const struct pipe_video_codec *templ)
3501 {
3502 unreachable("Threaded context should not be enabled for video APIs");
3503 return NULL;
3504 }
3505
3506 static struct pipe_video_buffer *
3507 tc_create_video_buffer(UNUSED struct pipe_context *_pipe,
3508 UNUSED const struct pipe_video_buffer *templ)
3509 {
3510 unreachable("Threaded context should not be enabled for video APIs");
3511 return NULL;
3512 }
3513
3514 struct tc_context_param {
3515 struct tc_call_base base;
3516 enum pipe_context_param param;
3517 unsigned value;
3518 };
3519
3520 static uint16_t
3521 tc_call_set_context_param(struct pipe_context *pipe, void *call)
3522 {
3523 struct tc_context_param *p = to_call(call, tc_context_param);
3524
3525 if (pipe->set_context_param)
3526 pipe->set_context_param(pipe, p->param, p->value);
3527
3528 return call_size(tc_context_param);
3529 }
3530
3531 static void
3532 tc_set_context_param(struct pipe_context *_pipe,
3533 enum pipe_context_param param,
3534 unsigned value)
3535 {
3536 struct threaded_context *tc = threaded_context(_pipe);
3537
3538 if (param == PIPE_CONTEXT_PARAM_UPDATE_THREAD_SCHEDULING) {
3539 util_thread_sched_apply_policy(tc->queue.threads[0],
3540 UTIL_THREAD_THREADED_CONTEXT, value,
3541 NULL);
3542
3543 /* Execute this immediately (without enqueuing).
3544 * It's required to be thread-safe.
3545 */
3546 struct pipe_context *pipe = tc->pipe;
3547 if (pipe->set_context_param)
3548 pipe->set_context_param(pipe, param, value);
3549 return;
3550 }
3551
3552 if (tc->pipe->set_context_param) {
3553 struct tc_context_param *call =
3554 tc_add_call(tc, TC_CALL_set_context_param, tc_context_param);
3555
3556 call->param = param;
3557 call->value = value;
3558 }
3559 }
3560
3561
3562 /********************************************************************
3563 * draw, launch, clear, blit, copy, flush
3564 */
3565
3566 struct tc_flush_deferred_call {
3567 struct tc_call_base base;
3568 unsigned flags;
3569 struct pipe_fence_handle *fence;
3570 };
3571
3572 struct tc_flush_call {
3573 struct tc_call_base base;
3574 unsigned flags;
3575 struct pipe_fence_handle *fence;
3576 struct threaded_context *tc;
3577 };
3578
3579 static void
3580 tc_flush_queries(struct threaded_context *tc)
3581 {
3582 struct threaded_query *tq, *tmp;
3583 LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
3584 list_del(&tq->head_unflushed);
3585
3586 /* Memory release semantics: due to a possible race with
3587 * tc_get_query_result, we must ensure that the linked list changes
3588 * are visible before setting tq->flushed.
3589 */
3590 p_atomic_set(&tq->flushed, true);
3591 }
3592 }
3593
3594 static uint16_t
3595 tc_call_flush_deferred(struct pipe_context *pipe, void *call)
3596 {
3597 struct tc_flush_deferred_call *p = to_call(call, tc_flush_deferred_call);
3598 struct pipe_screen *screen = pipe->screen;
3599
3600 pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
3601 screen->fence_reference(screen, &p->fence, NULL);
3602
3603 return call_size(tc_flush_deferred_call);
3604 }
3605
3606 static uint16_t
3607 tc_call_flush(struct pipe_context *pipe, void *call)
3608 {
3609 struct tc_flush_call *p = to_call(call, tc_flush_call);
3610 struct pipe_screen *screen = pipe->screen;
3611
3612 pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
3613 screen->fence_reference(screen, &p->fence, NULL);
3614
3615 tc_flush_queries(p->tc);
3616
3617 return call_size(tc_flush_call);
3618 }
3619
3620 static void
3621 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
3622 unsigned flags)
3623 {
3624 struct threaded_context *tc = threaded_context(_pipe);
3625 struct pipe_context *pipe = tc->pipe;
3626 struct pipe_screen *screen = pipe->screen;
3627 bool async = flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC);
3628 bool deferred = (flags & PIPE_FLUSH_DEFERRED) > 0;
3629
3630 if (!deferred || !fence)
3631 tc->in_renderpass = false;
3632
3633 if (async && tc->options.create_fence) {
3634 if (fence) {
3635 struct tc_batch *next = &tc->batch_slots[tc->next];
3636
3637 if (!next->token) {
3638 next->token = malloc(sizeof(*next->token));
3639 if (!next->token)
3640 goto out_of_memory;
3641
3642 pipe_reference_init(&next->token->ref, 1);
3643 next->token->tc = tc;
3644 }
3645
3646 screen->fence_reference(screen, fence,
3647 tc->options.create_fence(pipe, next->token));
3648 if (!*fence)
3649 goto out_of_memory;
3650 }
3651
3652 struct tc_flush_call *p;
3653 if (deferred) {
3654 /* these have identical fields */
3655 p = (struct tc_flush_call *)tc_add_call(tc, TC_CALL_flush_deferred, tc_flush_deferred_call);
3656 } else {
3657 p = tc_add_call(tc, TC_CALL_flush, tc_flush_call);
3658 p->tc = tc;
3659 }
3660 p->fence = fence ? *fence : NULL;
3661 p->flags = flags | TC_FLUSH_ASYNC;
3662
3663 if (!deferred) {
3664 /* non-deferred async flushes indicate completion of existing renderpass info */
3665 tc_signal_renderpass_info_ready(tc);
3666 tc_batch_flush(tc, false);
3667 tc->seen_fb_state = false;
3668 }
3669
3670 return;
3671 }
3672
3673 out_of_memory:
3674 tc->flushing = true;
3675 /* renderpass info is signaled during sync */
3676 tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
3677 flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
3678
3679 if (!deferred) {
3680 tc_flush_queries(tc);
3681 tc->seen_fb_state = false;
3682 tc->query_ended = false;
3683 }
3684 tc_set_driver_thread(tc);
3685 pipe->flush(pipe, fence, flags);
3686 tc_clear_driver_thread(tc);
3687 tc->flushing = false;
3688 }
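/* Usage sketch for the flush paths above (illustrative, from the state
 * tracker's point of view):
 *
 *    struct pipe_fence_handle *fence = NULL;
 *
 *    // With PIPE_FLUSH_ASYNC and a driver that provides create_fence,
 *    // this only enqueues a flush call and returns a token-backed fence.
 *    ctx->flush(ctx, &fence, PIPE_FLUSH_ASYNC);
 *
 *    // Without the ASYNC/DEFERRED flags, or when create_fence is missing,
 *    // the code falls through to tc_sync_msg and flushes synchronously.
 *    ctx->flush(ctx, NULL, 0);
 */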
3689
3690 struct tc_draw_single_drawid {
3691 struct tc_draw_single base;
3692 unsigned drawid_offset;
3693 };
3694
3695 static uint16_t
3696 tc_call_draw_single_drawid(struct pipe_context *pipe, void *call)
3697 {
3698 struct tc_draw_single_drawid *info_drawid = to_call(call, tc_draw_single_drawid);
3699 struct tc_draw_single *info = &info_drawid->base;
3700
3701 /* u_threaded_context stores start/count in min/max_index for single draws. */
3702 /* Drivers using u_threaded_context shouldn't use min/max_index. */
3703 struct pipe_draw_start_count_bias draw;
3704
3705 draw.start = info->info.min_index;
3706 draw.count = info->info.max_index;
3707 draw.index_bias = info->index_bias;
3708
3709 info->info.index_bounds_valid = false;
3710 info->info.has_user_indices = false;
3711 info->info.take_index_buffer_ownership = false;
3712
3713 pipe->draw_vbo(pipe, &info->info, info_drawid->drawid_offset, NULL, &draw, 1);
3714 if (info->info.index_size)
3715 tc_drop_resource_reference(info->info.index.resource);
3716
3717 return call_size(tc_draw_single_drawid);
3718 }
3719
3720 static void
3721 simplify_draw_info(struct pipe_draw_info *info)
3722 {
3723 /* Clear these fields to facilitate draw merging.
3724 * Drivers shouldn't use them.
3725 */
3726 info->has_user_indices = false;
3727 info->index_bounds_valid = false;
3728 info->take_index_buffer_ownership = false;
3729 info->index_bias_varies = false;
3730 info->_pad = 0;
3731
3732 /* This shouldn't be set when merging single draws. */
3733 info->increment_draw_id = false;
3734
3735 if (info->index_size) {
3736 if (!info->primitive_restart)
3737 info->restart_index = 0;
3738 } else {
3739 assert(!info->primitive_restart);
3740 info->primitive_restart = false;
3741 info->restart_index = 0;
3742 info->index.resource = NULL;
3743 }
3744 }
3745
3746 static bool
3747 is_next_call_a_mergeable_draw(struct tc_draw_single *first,
3748 struct tc_draw_single *next)
3749 {
3750 if (next->base.call_id != TC_CALL_draw_single)
3751 return false;
3752
3753 STATIC_ASSERT(offsetof(struct pipe_draw_info, min_index) ==
3754 sizeof(struct pipe_draw_info) - 8);
3755 STATIC_ASSERT(offsetof(struct pipe_draw_info, max_index) ==
3756 sizeof(struct pipe_draw_info) - 4);
3757 /* All fields must be the same except start and count. */
3758 /* u_threaded_context stores start/count in min/max_index for single draws. */
3759 return memcmp((uint32_t*)&first->info, (uint32_t*)&next->info,
3760 DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX) == 0;
3761 }
3762
3763 static uint16_t
3764 tc_call_draw_single(struct pipe_context *pipe, void *call)
3765 {
3766 /* Draw call merging. */
3767 struct tc_draw_single *first = to_call(call, tc_draw_single);
3768 struct tc_draw_single *next = get_next_call(first, tc_draw_single);
3769
3770 /* If at least 2 consecutive draw calls can be merged... */
3771 if (next->base.call_id == TC_CALL_draw_single) {
3772 if (is_next_call_a_mergeable_draw(first, next)) {
3773 /* The maximum number of merged draws is given by the batch size. */
3774 struct pipe_draw_start_count_bias multi[TC_SLOTS_PER_BATCH / call_size(tc_draw_single)];
3775 unsigned num_draws = 2;
3776 bool index_bias_varies = first->index_bias != next->index_bias;
3777
3778 /* u_threaded_context stores start/count in min/max_index for single draws. */
3779 multi[0].start = first->info.min_index;
3780 multi[0].count = first->info.max_index;
3781 multi[0].index_bias = first->index_bias;
3782 multi[1].start = next->info.min_index;
3783 multi[1].count = next->info.max_index;
3784 multi[1].index_bias = next->index_bias;
3785
3786 /* Find how many other draws can be merged. */
3787 next = get_next_call(next, tc_draw_single);
3788 for (; is_next_call_a_mergeable_draw(first, next);
3789 next = get_next_call(next, tc_draw_single), num_draws++) {
3790 /* u_threaded_context stores start/count in min/max_index for single draws. */
3791 multi[num_draws].start = next->info.min_index;
3792 multi[num_draws].count = next->info.max_index;
3793 multi[num_draws].index_bias = next->index_bias;
3794 index_bias_varies |= first->index_bias != next->index_bias;
3795 }
3796
3797 first->info.index_bias_varies = index_bias_varies;
3798 pipe->draw_vbo(pipe, &first->info, 0, NULL, multi, num_draws);
3799
3800 /* Since all draws use the same index buffer, drop all references at once. */
3801 if (first->info.index_size)
3802 pipe_drop_resource_references(first->info.index.resource, num_draws);
3803
3804 return call_size(tc_draw_single) * num_draws;
3805 }
3806 }
3807
3808 /* u_threaded_context stores start/count in min/max_index for single draws. */
3809 /* Drivers using u_threaded_context shouldn't use min/max_index. */
3810 struct pipe_draw_start_count_bias draw;
3811
3812 draw.start = first->info.min_index;
3813 draw.count = first->info.max_index;
3814 draw.index_bias = first->index_bias;
3815
3816 first->info.index_bounds_valid = false;
3817 first->info.has_user_indices = false;
3818 first->info.take_index_buffer_ownership = false;
3819
3820 pipe->draw_vbo(pipe, &first->info, 0, NULL, &draw, 1);
3821 if (first->info.index_size)
3822 tc_drop_resource_reference(first->info.index.resource);
3823
3824 return call_size(tc_draw_single);
3825 }
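/* Example (illustrative): if an application issues three draws back to back
 * with identical state and the same index buffer, the batch contains three
 * consecutive TC_CALL_draw_single entries whose pipe_draw_info differs only
 * in min/max_index (start/count).  The merging path above then issues a
 * single
 *
 *    pipe->draw_vbo(pipe, &first->info, 0, NULL, multi, 3);
 *
 * and drops the three index-buffer references at once.
 */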
3826
3827 struct tc_draw_indirect {
3828 struct tc_call_base base;
3829 struct pipe_draw_start_count_bias draw;
3830 struct pipe_draw_info info;
3831 struct pipe_draw_indirect_info indirect;
3832 };
3833
3834 static uint16_t
3835 tc_call_draw_indirect(struct pipe_context *pipe, void *call)
3836 {
3837 struct tc_draw_indirect *info = to_call(call, tc_draw_indirect);
3838
3839 info->info.index_bounds_valid = false;
3840 info->info.take_index_buffer_ownership = false;
3841
3842 pipe->draw_vbo(pipe, &info->info, 0, &info->indirect, &info->draw, 1);
3843 if (info->info.index_size)
3844 tc_drop_resource_reference(info->info.index.resource);
3845
3846 tc_drop_resource_reference(info->indirect.buffer);
3847 tc_drop_resource_reference(info->indirect.indirect_draw_count);
3848 tc_drop_so_target_reference(info->indirect.count_from_stream_output);
3849 return call_size(tc_draw_indirect);
3850 }
3851
3852 struct tc_draw_multi {
3853 struct tc_call_base base;
3854 unsigned num_draws;
3855 struct pipe_draw_info info;
3856 struct pipe_draw_start_count_bias slot[]; /* variable-sized array */
3857 };
3858
3859 static uint16_t
3860 tc_call_draw_multi(struct pipe_context *pipe, void *call)
3861 {
3862 struct tc_draw_multi *info = (struct tc_draw_multi*)call;
3863
3864 info->info.has_user_indices = false;
3865 info->info.index_bounds_valid = false;
3866 info->info.take_index_buffer_ownership = false;
3867
3868 pipe->draw_vbo(pipe, &info->info, 0, NULL, info->slot, info->num_draws);
3869 if (info->info.index_size)
3870 tc_drop_resource_reference(info->info.index.resource);
3871
3872 return info->base.num_slots;
3873 }
3874
3875 #define DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX \
3876 offsetof(struct pipe_draw_info, index)
3877
3878 /* Single draw with drawid_offset == 0. */
3879 static void
3880 tc_draw_single(struct pipe_context *_pipe, const struct pipe_draw_info *info,
3881 unsigned drawid_offset,
3882 const struct pipe_draw_indirect_info *indirect,
3883 const struct pipe_draw_start_count_bias *draws,
3884 unsigned num_draws)
3885 {
3886 struct threaded_context *tc = threaded_context(_pipe);
3887 struct tc_draw_single *p =
3888 tc_add_call(tc, TC_CALL_draw_single, tc_draw_single);
3889
3890 if (info->index_size) {
3891 if (!info->take_index_buffer_ownership) {
3892 tc_set_resource_reference(&p->info.index.resource,
3893 info->index.resource);
3894 }
3895 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->index.resource);
3896 }
3897 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
3898 /* u_threaded_context stores start/count in min/max_index for single draws. */
3899 p->info.min_index = draws[0].start;
3900 p->info.max_index = draws[0].count;
3901 p->index_bias = draws[0].index_bias;
3902 simplify_draw_info(&p->info);
3903 }
3904
3905 /* Single draw with drawid_offset > 0. */
3906 static void
3907 tc_draw_single_draw_id(struct pipe_context *_pipe,
3908 const struct pipe_draw_info *info,
3909 unsigned drawid_offset,
3910 const struct pipe_draw_indirect_info *indirect,
3911 const struct pipe_draw_start_count_bias *draws,
3912 unsigned num_draws)
3913 {
3914 struct threaded_context *tc = threaded_context(_pipe);
3915 struct tc_draw_single *p =
3916 &tc_add_call(tc, TC_CALL_draw_single_drawid, tc_draw_single_drawid)->base;
3917
3918 if (info->index_size) {
3919 if (!info->take_index_buffer_ownership) {
3920 tc_set_resource_reference(&p->info.index.resource,
3921 info->index.resource);
3922 }
3923 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->index.resource);
3924 }
3925 ((struct tc_draw_single_drawid*)p)->drawid_offset = drawid_offset;
3926 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
3927 /* u_threaded_context stores start/count in min/max_index for single draws. */
3928 p->info.min_index = draws[0].start;
3929 p->info.max_index = draws[0].count;
3930 p->index_bias = draws[0].index_bias;
3931 simplify_draw_info(&p->info);
3932 }
3933
3934 /* Single draw with user indices and drawid_offset == 0. */
3935 static void
3936 tc_draw_user_indices_single(struct pipe_context *_pipe,
3937 const struct pipe_draw_info *info,
3938 unsigned drawid_offset,
3939 const struct pipe_draw_indirect_info *indirect,
3940 const struct pipe_draw_start_count_bias *draws,
3941 unsigned num_draws)
3942 {
3943 struct threaded_context *tc = threaded_context(_pipe);
3944 unsigned index_size = info->index_size;
3945 unsigned size = draws[0].count * index_size;
3946 struct pipe_resource *buffer = NULL;
3947 unsigned offset;
3948
3949 if (!size)
3950 return;
3951
3952    /* This must be done before adding draw_vbo, because doing it afterwards
3953     * could generate e.g. a transfer_unmap and flush a partially-initialized
3954     * draw_vbo call to the driver.
3955     */
3956 u_upload_data(tc->base.stream_uploader, 0, size, 4,
3957 (uint8_t*)info->index.user + draws[0].start * index_size,
3958 &offset, &buffer);
3959 if (unlikely(!buffer))
3960 return;
3961
3962 struct tc_draw_single *p =
3963 tc_add_call(tc, TC_CALL_draw_single, tc_draw_single);
3964 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
3965 p->info.index.resource = buffer;
3966 /* u_threaded_context stores start/count in min/max_index for single draws. */
3967 p->info.min_index = offset >> util_logbase2(index_size);
3968 p->info.max_index = draws[0].count;
3969 p->index_bias = draws[0].index_bias;
3970 simplify_draw_info(&p->info);
3971 }
3972
3973 /* Single draw with user indices and drawid_offset > 0. */
3974 static void
3975 tc_draw_user_indices_single_draw_id(struct pipe_context *_pipe,
3976 const struct pipe_draw_info *info,
3977 unsigned drawid_offset,
3978 const struct pipe_draw_indirect_info *indirect,
3979 const struct pipe_draw_start_count_bias *draws,
3980 unsigned num_draws)
3981 {
3982 struct threaded_context *tc = threaded_context(_pipe);
3983 unsigned index_size = info->index_size;
3984 unsigned size = draws[0].count * index_size;
3985 struct pipe_resource *buffer = NULL;
3986 unsigned offset;
3987
3988 if (!size)
3989 return;
3990
3991    /* This must be done before adding draw_vbo, because doing it afterwards
3992     * could generate e.g. a transfer_unmap and flush a partially-initialized
3993     * draw_vbo call to the driver.
3994     */
3995 u_upload_data(tc->base.stream_uploader, 0, size, 4,
3996 (uint8_t*)info->index.user + draws[0].start * index_size,
3997 &offset, &buffer);
3998 if (unlikely(!buffer))
3999 return;
4000
4001 struct tc_draw_single *p =
4002 &tc_add_call(tc, TC_CALL_draw_single_drawid, tc_draw_single_drawid)->base;
4003 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
4004 p->info.index.resource = buffer;
4005 ((struct tc_draw_single_drawid*)p)->drawid_offset = drawid_offset;
4006 /* u_threaded_context stores start/count in min/max_index for single draws. */
4007 p->info.min_index = offset >> util_logbase2(index_size);
4008 p->info.max_index = draws[0].count;
4009 p->index_bias = draws[0].index_bias;
4010 simplify_draw_info(&p->info);
4011 }
4012
4013 #define DRAW_OVERHEAD_BYTES sizeof(struct tc_draw_multi)
4014 #define ONE_DRAW_SLOT_BYTES sizeof(((struct tc_draw_multi*)NULL)->slot[0])
4015
4016 #define SLOTS_FOR_ONE_DRAW \
4017 DIV_ROUND_UP(DRAW_OVERHEAD_BYTES + ONE_DRAW_SLOT_BYTES, \
4018 sizeof(struct tc_call_base))
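/* Hypothetical example of the slot math (actual sizes depend on the build):
 * if DRAW_OVERHEAD_BYTES were 80, ONE_DRAW_SLOT_BYTES 12 and
 * sizeof(struct tc_call_base) 8, then
 *
 *    SLOTS_FOR_ONE_DRAW = DIV_ROUND_UP(80 + 12, 8) = 12 slots,
 *
 * i.e. the smallest tc_draw_multi call (header plus one draw) that the
 * batch-splitting loops below must be able to fit.
 */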
4019
4020 static void
4021 tc_draw_multi(struct pipe_context *_pipe, const struct pipe_draw_info *info,
4022 unsigned drawid_offset,
4023 const struct pipe_draw_indirect_info *indirect,
4024 const struct pipe_draw_start_count_bias *draws,
4025 unsigned num_draws)
4026 {
4027 struct threaded_context *tc = threaded_context(_pipe);
4028 int total_offset = 0;
4029 bool take_index_buffer_ownership = info->take_index_buffer_ownership;
4030
4031 while (num_draws) {
4032 struct tc_batch *next = &tc->batch_slots[tc->next];
4033
4034 int nb_slots_left = TC_SLOTS_PER_BATCH - 1 - next->num_total_slots;
4035       /* If there isn't enough room for one draw, size the chunk for the next (empty) batch */
4036 if (nb_slots_left < SLOTS_FOR_ONE_DRAW)
4037 nb_slots_left = TC_SLOTS_PER_BATCH - 1;
4038 const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base);
4039
4040 /* How many draws can we fit in the current batch */
4041 const int dr = MIN2(num_draws, (size_left_bytes - DRAW_OVERHEAD_BYTES) /
4042 ONE_DRAW_SLOT_BYTES);
4043
4044 /* Non-indexed call or indexed with a real index buffer. */
4045 struct tc_draw_multi *p =
4046 tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
4047 dr);
4048 if (info->index_size) {
4049 if (!take_index_buffer_ownership) {
4050 tc_set_resource_reference(&p->info.index.resource,
4051 info->index.resource);
4052 }
4053 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->index.resource);
4054 }
4055 take_index_buffer_ownership = false;
4056 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
4057 p->num_draws = dr;
4058 memcpy(p->slot, &draws[total_offset], sizeof(draws[0]) * dr);
4059 num_draws -= dr;
4060
4061 total_offset += dr;
4062 }
4063 }
4064
4065 static void
4066 tc_draw_user_indices_multi(struct pipe_context *_pipe,
4067 const struct pipe_draw_info *info,
4068 unsigned drawid_offset,
4069 const struct pipe_draw_indirect_info *indirect,
4070 const struct pipe_draw_start_count_bias *draws,
4071 unsigned num_draws)
4072 {
4073 struct threaded_context *tc = threaded_context(_pipe);
4074 struct pipe_resource *buffer = NULL;
4075 unsigned buffer_offset, total_count = 0;
4076 unsigned index_size_shift = util_logbase2(info->index_size);
4077 uint8_t *ptr = NULL;
4078
4079 /* Get the total count. */
4080 for (unsigned i = 0; i < num_draws; i++)
4081 total_count += draws[i].count;
4082
4083 if (!total_count)
4084 return;
4085
4086 /* Allocate space for all index buffers.
4087 *
4088     * This must be done before adding draw_vbo, because doing it afterwards
4089     * could generate e.g. a transfer_unmap and flush a partially-initialized
4090     * draw_vbo call to the driver.
4091 */
4092 u_upload_alloc(tc->base.stream_uploader, 0,
4093 total_count << index_size_shift, 4,
4094 &buffer_offset, &buffer, (void**)&ptr);
4095 if (unlikely(!buffer))
4096 return;
4097
4098 int total_offset = 0;
4099 unsigned offset = 0;
4100 while (num_draws) {
4101 struct tc_batch *next = &tc->batch_slots[tc->next];
4102
4103 int nb_slots_left = TC_SLOTS_PER_BATCH - 1 - next->num_total_slots;
4104       /* If there isn't enough room for one draw, size the chunk for the next (empty) batch */
4105 if (nb_slots_left < SLOTS_FOR_ONE_DRAW)
4106 nb_slots_left = TC_SLOTS_PER_BATCH - 1;
4107 const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base);
4108
4109 /* How many draws can we fit in the current batch */
4110 const int dr = MIN2(num_draws, (size_left_bytes - DRAW_OVERHEAD_BYTES) /
4111 ONE_DRAW_SLOT_BYTES);
4112
4113 struct tc_draw_multi *p =
4114 tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
4115 dr);
4116 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
4117
4118 if (total_offset == 0)
4119 /* the first slot inherits the reference from u_upload_alloc() */
4120 p->info.index.resource = buffer;
4121 else
4122 /* all following slots need a new reference */
4123 tc_set_resource_reference(&p->info.index.resource, buffer);
4124
4125 p->num_draws = dr;
4126
4127 /* Upload index buffers. */
4128 for (unsigned i = 0; i < dr; i++) {
4129 unsigned count = draws[i + total_offset].count;
4130
4131 if (!count) {
4132 p->slot[i].start = 0;
4133 p->slot[i].count = 0;
4134 p->slot[i].index_bias = 0;
4135 continue;
4136 }
4137
4138 unsigned size = count << index_size_shift;
4139 memcpy(ptr + offset,
4140 (uint8_t*)info->index.user +
4141 (draws[i + total_offset].start << index_size_shift), size);
4142 p->slot[i].start = (buffer_offset + offset) >> index_size_shift;
4143 p->slot[i].count = count;
4144 p->slot[i].index_bias = draws[i + total_offset].index_bias;
4145 offset += size;
4146 }
4147
4148 total_offset += dr;
4149 num_draws -= dr;
4150 }
4151 }
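/* Worked example for the index packing above (illustrative numbers): with
 * 16-bit indices (index_size_shift == 1), buffer_offset == 1024 and a draw
 * whose indices were copied at offset == 64 within the upload, the queued
 * draw gets
 *
 *    slot[i].start = (1024 + 64) >> 1 = 544,
 *
 * i.e. a start expressed in indices relative to the uploaded buffer, while
 * count and index_bias are taken from the original draw.
 */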
4152
4153 static void
4154 tc_draw_indirect(struct pipe_context *_pipe, const struct pipe_draw_info *info,
4155 unsigned drawid_offset,
4156 const struct pipe_draw_indirect_info *indirect,
4157 const struct pipe_draw_start_count_bias *draws,
4158 unsigned num_draws)
4159 {
4160 struct threaded_context *tc = threaded_context(_pipe);
4161 assert(!info->has_user_indices);
4162 assert(num_draws == 1);
4163
4164 struct tc_draw_indirect *p =
4165 tc_add_call(tc, TC_CALL_draw_indirect, tc_draw_indirect);
4166 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
4167
4168 if (info->index_size) {
4169 if (!info->take_index_buffer_ownership) {
4170 tc_set_resource_reference(&p->info.index.resource,
4171 info->index.resource);
4172 }
4173 tc_add_to_buffer_list(next, info->index.resource);
4174 }
4175 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
4176
4177 tc_set_resource_reference(&p->indirect.buffer, indirect->buffer);
4178 tc_set_resource_reference(&p->indirect.indirect_draw_count,
4179 indirect->indirect_draw_count);
4180 p->indirect.count_from_stream_output = NULL;
4181 pipe_so_target_reference(&p->indirect.count_from_stream_output,
4182 indirect->count_from_stream_output);
4183
4184 if (indirect->buffer)
4185 tc_add_to_buffer_list(next, indirect->buffer);
4186 if (indirect->indirect_draw_count)
4187 tc_add_to_buffer_list(next, indirect->indirect_draw_count);
4188 if (indirect->count_from_stream_output)
4189 tc_add_to_buffer_list(next, indirect->count_from_stream_output->buffer);
4190
4191 memcpy(&p->indirect, indirect, sizeof(*indirect));
4192 p->draw.start = draws[0].start;
4193 }
4194
4195 /* Dispatch table for tc_draw_vbo:
4196 *
4197 * Indexed by:
4198 * [is_indirect * 8 + index_size_and_has_user_indices * 4 +
4199 * is_multi_draw * 2 + non_zero_draw_id]
4200 */
4201 static pipe_draw_func draw_funcs[16] = {
4202 tc_draw_single,
4203 tc_draw_single_draw_id,
4204 tc_draw_multi,
4205 tc_draw_multi,
4206 tc_draw_user_indices_single,
4207 tc_draw_user_indices_single_draw_id,
4208 tc_draw_user_indices_multi,
4209 tc_draw_user_indices_multi,
4210 tc_draw_indirect,
4211 tc_draw_indirect,
4212 tc_draw_indirect,
4213 tc_draw_indirect,
4214 tc_draw_indirect,
4215 tc_draw_indirect,
4216 tc_draw_indirect,
4217 tc_draw_indirect,
4218 };
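/* Example (illustrative): an indexed multi-draw with user-space indices and
 * drawid_offset == 0 computes
 *
 *    index = 0 * 8 + 1 * 4 + 1 * 2 + 0 = 6,
 *
 * which dispatches to tc_draw_user_indices_multi in the table above.
 */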
4219
4220 void
4221 tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
4222 unsigned drawid_offset,
4223 const struct pipe_draw_indirect_info *indirect,
4224 const struct pipe_draw_start_count_bias *draws,
4225 unsigned num_draws)
4226 {
4227 STATIC_ASSERT(DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX +
4228 sizeof(intptr_t) == offsetof(struct pipe_draw_info, min_index));
4229
4230 struct threaded_context *tc = threaded_context(_pipe);
4231 if (tc->options.parse_renderpass_info)
4232 tc_parse_draw(tc);
4233
4234 /* Use a function table to call the desired variant of draw_vbo. */
4235 unsigned index = (indirect != NULL) * 8 +
4236 (info->index_size && info->has_user_indices) * 4 +
4237 (num_draws > 1) * 2 + (drawid_offset != 0);
4238 draw_funcs[index](_pipe, info, drawid_offset, indirect, draws, num_draws);
4239
4240 /* This must be after tc_add_*call, which can flush the batch. */
4241 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4242 tc_add_all_gfx_bindings_to_buffer_list(tc);
4243 }
4244
4245 struct tc_draw_single *
4246 tc_add_draw_single_call(struct pipe_context *_pipe,
4247 struct pipe_resource *index_bo)
4248 {
4249 struct threaded_context *tc = threaded_context(_pipe);
4250
4251 if (tc->options.parse_renderpass_info)
4252 tc_parse_draw(tc);
4253
4254 struct tc_draw_single *p =
4255 tc_add_call(tc, TC_CALL_draw_single, tc_draw_single);
4256
4257 if (index_bo)
4258 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], index_bo);
4259
4260 /* This must be after tc_add_*call, which can flush the batch. */
4261 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4262 tc_add_all_gfx_bindings_to_buffer_list(tc);
4263
4264 return p;
4265 }
4266
4267 struct tc_draw_vstate_single {
4268 struct tc_call_base base;
4269 struct pipe_draw_start_count_bias draw;
4270
4271    /* The following states must be laid out contiguously, with no holes,
4272     * because draw merging compares them with memcmp.
4273     */
4274 struct pipe_vertex_state *state;
4275 uint32_t partial_velem_mask;
4276 struct pipe_draw_vertex_state_info info;
4277 };
4278
4279 static bool
4280 is_next_call_a_mergeable_draw_vstate(struct tc_draw_vstate_single *first,
4281 struct tc_draw_vstate_single *next)
4282 {
4283 if (next->base.call_id != TC_CALL_draw_vstate_single)
4284 return false;
4285
4286 return !memcmp(&first->state, &next->state,
4287 offsetof(struct tc_draw_vstate_single, info) +
4288 sizeof(struct pipe_draw_vertex_state_info) -
4289 offsetof(struct tc_draw_vstate_single, state));
4290 }
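/* Note (illustrative layout): the memcmp above covers the byte range from
 * offsetof(struct tc_draw_vstate_single, state) up to the end of the embedded
 * pipe_draw_vertex_state_info.  With a hypothetical layout of
 *
 *    state               at offset 16, size 8
 *    partial_velem_mask  at offset 24, size 4
 *    info                at offset 28, size 2
 *
 * it would compare 28 + 2 - 16 = 14 bytes, which is why the struct comment
 * requires these fields to be packed back to back with no holes.
 */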
4291
4292 static uint16_t
4293 tc_call_draw_vstate_single(struct pipe_context *pipe, void *call)
4294 {
4295 /* Draw call merging. */
4296 struct tc_draw_vstate_single *first = to_call(call, tc_draw_vstate_single);
4297 struct tc_draw_vstate_single *next = get_next_call(first, tc_draw_vstate_single);
4298
4299 /* If at least 2 consecutive draw calls can be merged... */
4300 if (is_next_call_a_mergeable_draw_vstate(first, next)) {
4301 /* The maximum number of merged draws is given by the batch size. */
4302 struct pipe_draw_start_count_bias draws[TC_SLOTS_PER_BATCH /
4303 call_size(tc_draw_vstate_single)];
4304 unsigned num_draws = 2;
4305
4306 draws[0] = first->draw;
4307 draws[1] = next->draw;
4308
4309 /* Find how many other draws can be merged. */
4310 next = get_next_call(next, tc_draw_vstate_single);
4311 for (; is_next_call_a_mergeable_draw_vstate(first, next);
4312 next = get_next_call(next, tc_draw_vstate_single),
4313 num_draws++)
4314 draws[num_draws] = next->draw;
4315
4316 pipe->draw_vertex_state(pipe, first->state, first->partial_velem_mask,
4317 first->info, draws, num_draws);
4318 /* Since all draws use the same state, drop all references at once. */
4319 tc_drop_vertex_state_references(first->state, num_draws);
4320
4321 return call_size(tc_draw_vstate_single) * num_draws;
4322 }
4323
4324 pipe->draw_vertex_state(pipe, first->state, first->partial_velem_mask,
4325 first->info, &first->draw, 1);
4326 tc_drop_vertex_state_references(first->state, 1);
4327 return call_size(tc_draw_vstate_single);
4328 }
4329
4330 struct tc_draw_vstate_multi {
4331 struct tc_call_base base;
4332 uint32_t partial_velem_mask;
4333 struct pipe_draw_vertex_state_info info;
4334 unsigned num_draws;
4335 struct pipe_vertex_state *state;
4336 struct pipe_draw_start_count_bias slot[0];
4337 };
4338
4339 static uint16_t
4340 tc_call_draw_vstate_multi(struct pipe_context *pipe, void *call)
4341 {
4342 struct tc_draw_vstate_multi *info = (struct tc_draw_vstate_multi*)call;
4343
4344 pipe->draw_vertex_state(pipe, info->state, info->partial_velem_mask,
4345 info->info, info->slot, info->num_draws);
4346 tc_drop_vertex_state_references(info->state, 1);
4347 return info->base.num_slots;
4348 }
4349
4350 static void
4351 tc_draw_vertex_state(struct pipe_context *_pipe,
4352 struct pipe_vertex_state *state,
4353 uint32_t partial_velem_mask,
4354 struct pipe_draw_vertex_state_info info,
4355 const struct pipe_draw_start_count_bias *draws,
4356 unsigned num_draws)
4357 {
4358 struct threaded_context *tc = threaded_context(_pipe);
4359 if (tc->options.parse_renderpass_info)
4360 tc_parse_draw(tc);
4361
4362 if (num_draws == 1) {
4363 /* Single draw. */
4364 struct tc_draw_vstate_single *p =
4365 tc_add_call(tc, TC_CALL_draw_vstate_single, tc_draw_vstate_single);
4366 p->partial_velem_mask = partial_velem_mask;
4367 p->draw = draws[0];
4368 p->info.mode = info.mode;
4369 p->info.take_vertex_state_ownership = false;
4370
4371       /* This should always be 0 for simplicity, because we assume that
4372 * index_bias doesn't vary.
4373 */
4374 assert(draws[0].index_bias == 0);
4375
4376 if (!info.take_vertex_state_ownership)
4377 tc_set_vertex_state_reference(&p->state, state);
4378 else
4379 p->state = state;
4380
4381
4382 /* This must be after tc_add_*call, which can flush the batch. */
4383 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4384 tc_add_all_gfx_bindings_to_buffer_list(tc);
4385 return;
4386 }
4387
4388 const int draw_overhead_bytes = sizeof(struct tc_draw_vstate_multi);
4389 const int one_draw_slot_bytes = sizeof(((struct tc_draw_vstate_multi*)NULL)->slot[0]);
4390 const int slots_for_one_draw = DIV_ROUND_UP(draw_overhead_bytes + one_draw_slot_bytes,
4391 sizeof(struct tc_call_base));
4392 /* Multi draw. */
4393 int total_offset = 0;
4394 bool take_vertex_state_ownership = info.take_vertex_state_ownership;
4395 while (num_draws) {
4396 struct tc_batch *next = &tc->batch_slots[tc->next];
4397
4398 int nb_slots_left = TC_SLOTS_PER_BATCH - 1 - next->num_total_slots;
4399       /* If there isn't enough room for one draw, size the chunk for the next (empty) batch */
4400 if (nb_slots_left < slots_for_one_draw)
4401 nb_slots_left = TC_SLOTS_PER_BATCH - 1;
4402 const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base);
4403
4404 /* How many draws can we fit in the current batch */
4405 const int dr = MIN2(num_draws, (size_left_bytes - draw_overhead_bytes) / one_draw_slot_bytes);
4406
4407 /* Non-indexed call or indexed with a real index buffer. */
4408 struct tc_draw_vstate_multi *p =
4409 tc_add_slot_based_call(tc, TC_CALL_draw_vstate_multi, tc_draw_vstate_multi, dr);
4410
4411 if (!take_vertex_state_ownership)
4412 tc_set_vertex_state_reference(&p->state, state);
4413 else
4414 p->state = state;
4415
4416 take_vertex_state_ownership = false;
4417 p->partial_velem_mask = partial_velem_mask;
4418 p->info.mode = info.mode;
4419 p->info.take_vertex_state_ownership = false;
4420 p->num_draws = dr;
4421 memcpy(p->slot, &draws[total_offset], sizeof(draws[0]) * dr);
4422 num_draws -= dr;
4423
4424 total_offset += dr;
4425 }
4426
4427
4428 /* This must be after tc_add_*call, which can flush the batch. */
4429 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4430 tc_add_all_gfx_bindings_to_buffer_list(tc);
4431 }
4432
4433 struct tc_launch_grid_call {
4434 struct tc_call_base base;
4435 struct pipe_grid_info info;
4436 };
4437
4438 static uint16_t
4439 tc_call_launch_grid(struct pipe_context *pipe, void *call)
4440 {
4441 struct pipe_grid_info *p = &to_call(call, tc_launch_grid_call)->info;
4442
4443 pipe->launch_grid(pipe, p);
4444 tc_drop_resource_reference(p->indirect);
4445 return call_size(tc_launch_grid_call);
4446 }
4447
4448 static void
4449 tc_launch_grid(struct pipe_context *_pipe,
4450 const struct pipe_grid_info *info)
4451 {
4452 struct threaded_context *tc = threaded_context(_pipe);
4453 struct tc_launch_grid_call *p = tc_add_call(tc, TC_CALL_launch_grid,
4454 tc_launch_grid_call);
4455 assert(info->input == NULL);
4456
4457 tc_set_resource_reference(&p->info.indirect, info->indirect);
4458 memcpy(&p->info, info, sizeof(*info));
4459
4460 if (info->indirect)
4461 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->indirect);
4462
4463 /* This must be after tc_add_*call, which can flush the batch. */
4464 if (unlikely(tc->add_all_compute_bindings_to_buffer_list))
4465 tc_add_all_compute_bindings_to_buffer_list(tc);
4466 }
4467
4468 static uint16_t
4469 tc_call_resource_copy_region(struct pipe_context *pipe, void *call)
4470 {
4471 struct tc_resource_copy_region *p = to_call(call, tc_resource_copy_region);
4472
4473 pipe->resource_copy_region(pipe, p->dst, p->dst_level, p->dstx, p->dsty,
4474 p->dstz, p->src, p->src_level, &p->src_box);
4475 tc_drop_resource_reference(p->dst);
4476 tc_drop_resource_reference(p->src);
4477 return call_size(tc_resource_copy_region);
4478 }
4479
4480 static void
4481 tc_resource_copy_region(struct pipe_context *_pipe,
4482 struct pipe_resource *dst, unsigned dst_level,
4483 unsigned dstx, unsigned dsty, unsigned dstz,
4484 struct pipe_resource *src, unsigned src_level,
4485 const struct pipe_box *src_box)
4486 {
4487 struct threaded_context *tc = threaded_context(_pipe);
4488 struct threaded_resource *tdst = threaded_resource(dst);
4489 struct tc_resource_copy_region *p =
4490 tc_add_call(tc, TC_CALL_resource_copy_region,
4491 tc_resource_copy_region);
4492
4493 if (dst->target == PIPE_BUFFER)
4494 tc_buffer_disable_cpu_storage(dst);
4495
4496 tc_set_resource_batch_usage(tc, dst);
4497 tc_set_resource_reference(&p->dst, dst);
4498 p->dst_level = dst_level;
4499 p->dstx = dstx;
4500 p->dsty = dsty;
4501 p->dstz = dstz;
4502 tc_set_resource_batch_usage(tc, src);
4503 tc_set_resource_reference(&p->src, src);
4504 p->src_level = src_level;
4505 p->src_box = *src_box;
4506
4507 if (dst->target == PIPE_BUFFER) {
4508 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
4509
4510 tc_add_to_buffer_list(next, src);
4511 tc_add_to_buffer_list(next, dst);
4512
4513 util_range_add(&tdst->b, &tdst->valid_buffer_range,
4514 dstx, dstx + src_box->width);
4515 }
4516 }
4517
4518 struct tc_blit_call {
4519 struct tc_call_base base;
4520 struct pipe_blit_info info;
4521 };
4522
4523 static uint16_t
4524 tc_call_blit(struct pipe_context *pipe, void *call)
4525 {
4526 struct pipe_blit_info *blit = &to_call(call, tc_blit_call)->info;
4527
4528 pipe->blit(pipe, blit);
4529 tc_drop_resource_reference(blit->dst.resource);
4530 tc_drop_resource_reference(blit->src.resource);
4531 return call_size(tc_blit_call);
4532 }
4533
4534 static void
4535 tc_blit_enqueue(struct threaded_context *tc, const struct pipe_blit_info *info)
4536 {
4537 struct tc_blit_call *blit = tc_add_call(tc, TC_CALL_blit, tc_blit_call);
4538
4539 tc_set_resource_batch_usage(tc, info->dst.resource);
4540 tc_set_resource_reference(&blit->info.dst.resource, info->dst.resource);
4541 tc_set_resource_batch_usage(tc, info->src.resource);
4542 tc_set_resource_reference(&blit->info.src.resource, info->src.resource);
4543 memcpy(&blit->info, info, sizeof(*info));
4544 }
4545
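/* When renderpass info is being recorded, single-sample resolves of the
 * currently bound framebuffer are tracked: a resolve into the designated
 * fb_resolve target is folded into the renderpass info and dropped entirely,
 * while a resolve from a bound color buffer is flagged but still enqueued.
 * Everything else (or any blit without renderpass parsing) goes straight to
 * tc_blit_enqueue().
 */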
4546 static void
4547 tc_blit(struct pipe_context *_pipe, const struct pipe_blit_info *info)
4548 {
4549 struct threaded_context *tc = threaded_context(_pipe);
4550
4551 /* filter out untracked non-resolves */
4552 if (!tc->options.parse_renderpass_info ||
4553 info->src.resource->nr_samples <= 1 ||
4554 info->dst.resource->nr_samples > 1) {
4555 tc_blit_enqueue(tc, info);
4556 return;
4557 }
4558
4559 if (tc->fb_resolve == info->dst.resource) {
4560 /* optimize out this blit entirely */
4561 tc->renderpass_info_recording->has_resolve = true;
4562 return;
4563 }
4564 for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
4565 if (tc->fb_resources[i] == info->src.resource) {
4566 tc->renderpass_info_recording->has_resolve = true;
4567 break;
4568 }
4569 }
4570 tc_blit_enqueue(tc, info);
4571 }
4572
4573 struct tc_generate_mipmap {
4574 struct tc_call_base base;
4575 enum pipe_format format;
4576 unsigned base_level;
4577 unsigned last_level;
4578 unsigned first_layer;
4579 unsigned last_layer;
4580 struct pipe_resource *res;
4581 };
4582
4583 static uint16_t
4584 tc_call_generate_mipmap(struct pipe_context *pipe, void *call)
4585 {
4586 struct tc_generate_mipmap *p = to_call(call, tc_generate_mipmap);
4587 ASSERTED bool result = pipe->generate_mipmap(pipe, p->res, p->format,
4588 p->base_level,
4589 p->last_level,
4590 p->first_layer,
4591 p->last_layer);
4592 assert(result);
4593 tc_drop_resource_reference(p->res);
4594 return call_size(tc_generate_mipmap);
4595 }
4596
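/* Format support is validated synchronously against the screen (using a
 * render-target or depth/stencil bind flag depending on the format) so the
 * boolean result can be returned right away; the actual mipmap generation is
 * recorded and merely asserted to succeed on the driver thread.
 */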
4597 static bool
4598 tc_generate_mipmap(struct pipe_context *_pipe,
4599 struct pipe_resource *res,
4600 enum pipe_format format,
4601 unsigned base_level,
4602 unsigned last_level,
4603 unsigned first_layer,
4604 unsigned last_layer)
4605 {
4606 struct threaded_context *tc = threaded_context(_pipe);
4607 struct pipe_context *pipe = tc->pipe;
4608 struct pipe_screen *screen = pipe->screen;
4609 unsigned bind = PIPE_BIND_SAMPLER_VIEW;
4610
4611 if (util_format_is_depth_or_stencil(format))
4612 bind = PIPE_BIND_DEPTH_STENCIL;
4613 else
4614 bind = PIPE_BIND_RENDER_TARGET;
4615
4616 if (!screen->is_format_supported(screen, format, res->target,
4617 res->nr_samples, res->nr_storage_samples,
4618 bind))
4619 return false;
4620
4621 struct tc_generate_mipmap *p =
4622 tc_add_call(tc, TC_CALL_generate_mipmap, tc_generate_mipmap);
4623
4624 tc_set_resource_batch_usage(tc, res);
4625 tc_set_resource_reference(&p->res, res);
4626 p->format = format;
4627 p->base_level = base_level;
4628 p->last_level = last_level;
4629 p->first_layer = first_layer;
4630 p->last_layer = last_layer;
4631 return true;
4632 }
4633
4634 struct tc_resource_call {
4635 struct tc_call_base base;
4636 struct pipe_resource *resource;
4637 };
4638
4639 static uint16_t
4640 tc_call_flush_resource(struct pipe_context *pipe, void *call)
4641 {
4642 struct pipe_resource *resource = to_call(call, tc_resource_call)->resource;
4643
4644 pipe->flush_resource(pipe, resource);
4645 tc_drop_resource_reference(resource);
4646 return call_size(tc_resource_call);
4647 }
4648
4649 static void
4650 tc_flush_resource(struct pipe_context *_pipe, struct pipe_resource *resource)
4651 {
4652 struct threaded_context *tc = threaded_context(_pipe);
4653 struct tc_resource_call *call = tc_add_call(tc, TC_CALL_flush_resource,
4654 tc_resource_call);
4655
4656 tc_set_resource_batch_usage(tc, resource);
4657 tc_set_resource_reference(&call->resource, resource);
4658 }
4659
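/* invalidate_resource splits by target: buffer invalidation is handled
 * immediately on the application thread via tc_invalidate_buffer() (which
 * may replace the buffer's storage), while texture invalidation is enqueued
 * and, if renderpass info is being recorded, noted as an invalidated
 * depth/stencil or color attachment.
 */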
4660 static uint16_t
4661 tc_call_invalidate_resource(struct pipe_context *pipe, void *call)
4662 {
4663 struct pipe_resource *resource = to_call(call, tc_resource_call)->resource;
4664
4665 pipe->invalidate_resource(pipe, resource);
4666 tc_drop_resource_reference(resource);
4667 return call_size(tc_resource_call);
4668 }
4669
4670 static void
4671 tc_invalidate_resource(struct pipe_context *_pipe,
4672 struct pipe_resource *resource)
4673 {
4674 struct threaded_context *tc = threaded_context(_pipe);
4675
4676 if (resource->target == PIPE_BUFFER) {
4677 tc_invalidate_buffer(tc, threaded_resource(resource));
4678 return;
4679 }
4680
4681 struct tc_resource_call *call = tc_add_call(tc, TC_CALL_invalidate_resource,
4682 tc_resource_call);
4683 tc_set_resource_batch_usage(tc, resource);
4684 tc_set_resource_reference(&call->resource, resource);
4685
4686 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
4687 if (info) {
4688 if (tc->fb_resources[PIPE_MAX_COLOR_BUFS] == resource) {
4689 info->zsbuf_invalidate = true;
4690 } else {
4691 for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
4692 if (tc->fb_resources[i] == resource)
4693 info->cbuf_invalidate |= BITFIELD_BIT(i);
4694 }
4695 }
4696 }
4697 }
4698
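/* The clear payload snapshots everything needed to replay pipe->clear() on
 * the driver thread. For renderpass tracking, scissored clears only mark a
 * partial depth/stencil clear, while unscissored clears can be promoted to a
 * "clear" load op, but only when nothing has been loaded or drawn yet.
 * PIPE_CLEAR_COLOR0 is bit 2, which is why "buffers >> 2" below lines the
 * color clear bits up with the per-color-buffer masks.
 */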
4699 struct tc_clear {
4700 struct tc_call_base base;
4701 bool scissor_state_set;
4702 uint8_t stencil;
4703 uint16_t buffers;
4704 float depth;
4705 struct pipe_scissor_state scissor_state;
4706 union pipe_color_union color;
4707 };
4708
4709 static uint16_t
4710 tc_call_clear(struct pipe_context *pipe, void *call)
4711 {
4712 struct tc_clear *p = to_call(call, tc_clear);
4713
4714 pipe->clear(pipe, p->buffers, p->scissor_state_set ? &p->scissor_state : NULL, &p->color, p->depth, p->stencil);
4715 return call_size(tc_clear);
4716 }
4717
4718 static void
4719 tc_clear(struct pipe_context *_pipe, unsigned buffers, const struct pipe_scissor_state *scissor_state,
4720 const union pipe_color_union *color, double depth,
4721 unsigned stencil)
4722 {
4723 struct threaded_context *tc = threaded_context(_pipe);
4724 struct tc_clear *p = tc_add_call(tc, TC_CALL_clear, tc_clear);
4725
4726 p->buffers = buffers;
4727 if (scissor_state) {
4728 p->scissor_state = *scissor_state;
4729 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
4730 /* partial clear info is useful for drivers to know whether any zs writes occur;
4731 * drivers are responsible for optimizing partial clear -> full clear
4732 */
4733 if (info && buffers & PIPE_CLEAR_DEPTHSTENCIL)
4734 info->zsbuf_clear_partial |= !info->zsbuf_clear;
4735 } else {
4736 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
4737 if (info) {
4738 /* full clears use a different load operation, but are only valid if draws haven't occurred yet */
4739 info->cbuf_clear |= (buffers >> 2) & ~info->cbuf_load;
4740 if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
4741 if (!info->zsbuf_load && !info->zsbuf_clear_partial)
4742 info->zsbuf_clear = true;
4743 else if (!info->zsbuf_clear)
4744 /* this is a clear that occurred after a draw: flag as partial to ensure it isn't ignored */
4745 info->zsbuf_clear_partial = true;
4746 }
4747 }
4748 }
4749 p->scissor_state_set = !!scissor_state;
4750 p->color = *color;
4751 p->depth = depth;
4752 p->stencil = stencil;
4753 }
4754
4755 struct tc_clear_render_target {
4756 struct tc_call_base base;
4757 bool render_condition_enabled;
4758 unsigned dstx;
4759 unsigned dsty;
4760 unsigned width;
4761 unsigned height;
4762 union pipe_color_union color;
4763 struct pipe_surface *dst;
4764 };
4765
4766 static uint16_t
4767 tc_call_clear_render_target(struct pipe_context *pipe, void *call)
4768 {
4769 struct tc_clear_render_target *p = to_call(call, tc_clear_render_target);
4770
4771 pipe->clear_render_target(pipe, p->dst, &p->color, p->dstx, p->dsty, p->width, p->height,
4772 p->render_condition_enabled);
4773 tc_drop_surface_reference(p->dst);
4774 return call_size(tc_clear_render_target);
4775 }
4776
4777 static void
4778 tc_clear_render_target(struct pipe_context *_pipe,
4779 struct pipe_surface *dst,
4780 const union pipe_color_union *color,
4781 unsigned dstx, unsigned dsty,
4782 unsigned width, unsigned height,
4783 bool render_condition_enabled)
4784 {
4785 struct threaded_context *tc = threaded_context(_pipe);
4786 struct tc_clear_render_target *p = tc_add_call(tc, TC_CALL_clear_render_target, tc_clear_render_target);
4787 p->dst = NULL;
4788 pipe_surface_reference(&p->dst, dst);
4789 p->color = *color;
4790 p->dstx = dstx;
4791 p->dsty = dsty;
4792 p->width = width;
4793 p->height = height;
4794 p->render_condition_enabled = render_condition_enabled;
4795 }
4796
4797
4798 struct tc_clear_depth_stencil {
4799 struct tc_call_base base;
4800 bool render_condition_enabled;
4801 float depth;
4802 unsigned clear_flags;
4803 unsigned stencil;
4804 unsigned dstx;
4805 unsigned dsty;
4806 unsigned width;
4807 unsigned height;
4808 struct pipe_surface *dst;
4809 };
4810
4811
4812 static uint16_t
4813 tc_call_clear_depth_stencil(struct pipe_context *pipe, void *call)
4814 {
4815 struct tc_clear_depth_stencil *p = to_call(call, tc_clear_depth_stencil);
4816
4817 pipe->clear_depth_stencil(pipe, p->dst, p->clear_flags, p->depth, p->stencil,
4818 p->dstx, p->dsty, p->width, p->height,
4819 p->render_condition_enabled);
4820 tc_drop_surface_reference(p->dst);
4821 return call_size(tc_clear_depth_stencil);
4822 }
4823
4824 static void
4825 tc_clear_depth_stencil(struct pipe_context *_pipe,
4826 struct pipe_surface *dst, unsigned clear_flags,
4827 double depth, unsigned stencil, unsigned dstx,
4828 unsigned dsty, unsigned width, unsigned height,
4829 bool render_condition_enabled)
4830 {
4831 struct threaded_context *tc = threaded_context(_pipe);
4832 struct tc_clear_depth_stencil *p = tc_add_call(tc, TC_CALL_clear_depth_stencil, tc_clear_depth_stencil);
4833 p->dst = NULL;
4834 pipe_surface_reference(&p->dst, dst);
4835 p->clear_flags = clear_flags;
4836 p->depth = depth;
4837 p->stencil = stencil;
4838 p->dstx = dstx;
4839 p->dsty = dsty;
4840 p->width = width;
4841 p->height = height;
4842 p->render_condition_enabled = render_condition_enabled;
4843 }
4844
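/* clear_buffer and clear_texture store the clear value inline; 16 bytes is
 * enough for the largest element (e.g. a four-channel 32-bit color). Buffer
 * clears additionally disable CPU shadow storage, join the current buffer
 * list, and extend the destination's valid range, mirroring the copy path
 * above.
 */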
4845 struct tc_clear_buffer {
4846 struct tc_call_base base;
4847 uint8_t clear_value_size;
4848 unsigned offset;
4849 unsigned size;
4850 char clear_value[16];
4851 struct pipe_resource *res;
4852 };
4853
4854 static uint16_t
4855 tc_call_clear_buffer(struct pipe_context *pipe, void *call)
4856 {
4857 struct tc_clear_buffer *p = to_call(call, tc_clear_buffer);
4858
4859 pipe->clear_buffer(pipe, p->res, p->offset, p->size, p->clear_value,
4860 p->clear_value_size);
4861 tc_drop_resource_reference(p->res);
4862 return call_size(tc_clear_buffer);
4863 }
4864
4865 static void
4866 tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res,
4867 unsigned offset, unsigned size,
4868 const void *clear_value, int clear_value_size)
4869 {
4870 struct threaded_context *tc = threaded_context(_pipe);
4871 struct threaded_resource *tres = threaded_resource(res);
4872 struct tc_clear_buffer *p =
4873 tc_add_call(tc, TC_CALL_clear_buffer, tc_clear_buffer);
4874
4875 tc_buffer_disable_cpu_storage(res);
4876
4877 tc_set_resource_reference(&p->res, res);
4878 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], res);
4879 p->offset = offset;
4880 p->size = size;
4881 memcpy(p->clear_value, clear_value, clear_value_size);
4882 p->clear_value_size = clear_value_size;
4883
4884 util_range_add(&tres->b, &tres->valid_buffer_range, offset, offset + size);
4885 }
4886
4887 struct tc_clear_texture {
4888 struct tc_call_base base;
4889 unsigned level;
4890 struct pipe_box box;
4891 char data[16];
4892 struct pipe_resource *res;
4893 };
4894
4895 static uint16_t
4896 tc_call_clear_texture(struct pipe_context *pipe, void *call)
4897 {
4898 struct tc_clear_texture *p = to_call(call, tc_clear_texture);
4899
4900 pipe->clear_texture(pipe, p->res, p->level, &p->box, p->data);
4901 tc_drop_resource_reference(p->res);
4902 return call_size(tc_clear_texture);
4903 }
4904
4905 static void
4906 tc_clear_texture(struct pipe_context *_pipe, struct pipe_resource *res,
4907 unsigned level, const struct pipe_box *box, const void *data)
4908 {
4909 struct threaded_context *tc = threaded_context(_pipe);
4910 struct tc_clear_texture *p =
4911 tc_add_call(tc, TC_CALL_clear_texture, tc_clear_texture);
4912
4913 tc_set_resource_batch_usage(tc, res);
4914 tc_set_resource_reference(&p->res, res);
4915 p->level = level;
4916 p->box = *box;
4917 memcpy(p->data, data,
4918 util_format_get_blocksize(res->format));
4919 }
4920
4921 struct tc_resource_commit {
4922 struct tc_call_base base;
4923 bool commit;
4924 unsigned level;
4925 struct pipe_box box;
4926 struct pipe_resource *res;
4927 };
4928
4929 static uint16_t
4930 tc_call_resource_commit(struct pipe_context *pipe, void *call)
4931 {
4932 struct tc_resource_commit *p = to_call(call, tc_resource_commit);
4933
4934 pipe->resource_commit(pipe, p->res, p->level, &p->box, p->commit);
4935 tc_drop_resource_reference(p->res);
4936 return call_size(tc_resource_commit);
4937 }
4938
4939 static bool
4940 tc_resource_commit(struct pipe_context *_pipe, struct pipe_resource *res,
4941 unsigned level, struct pipe_box *box, bool commit)
4942 {
4943 struct threaded_context *tc = threaded_context(_pipe);
4944 struct tc_resource_commit *p =
4945 tc_add_call(tc, TC_CALL_resource_commit, tc_resource_commit);
4946
4947 tc_set_resource_reference(&p->res, res);
4948 tc_set_resource_batch_usage(tc, res);
4949 p->level = level;
4950 p->box = *box;
4951 p->commit = commit;
4952 return true; /* we don't care about the return value for this call */
4953 }
4954
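/* Intel performance queries: most information and object-creation hooks call
 * the driver directly (get_intel_perf_query_info synchronizes because
 * n_active depends on pending begin/end calls), begin/end are recorded as
 * asynchronous batch calls, and the result/lifetime hooks (delete, wait,
 * is_ready, get_data) synchronize with the driver thread first.
 */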
4955 static unsigned
4956 tc_init_intel_perf_query_info(struct pipe_context *_pipe)
4957 {
4958 struct threaded_context *tc = threaded_context(_pipe);
4959 struct pipe_context *pipe = tc->pipe;
4960
4961 return pipe->init_intel_perf_query_info(pipe);
4962 }
4963
4964 static void
4965 tc_get_intel_perf_query_info(struct pipe_context *_pipe,
4966 unsigned query_index,
4967 const char **name,
4968 uint32_t *data_size,
4969 uint32_t *n_counters,
4970 uint32_t *n_active)
4971 {
4972 struct threaded_context *tc = threaded_context(_pipe);
4973 struct pipe_context *pipe = tc->pipe;
4974
4975 tc_sync(tc); /* n_active vs begin/end_intel_perf_query */
4976 pipe->get_intel_perf_query_info(pipe, query_index, name, data_size,
4977 n_counters, n_active);
4978 }
4979
4980 static void
4981 tc_get_intel_perf_query_counter_info(struct pipe_context *_pipe,
4982 unsigned query_index,
4983 unsigned counter_index,
4984 const char **name,
4985 const char **desc,
4986 uint32_t *offset,
4987 uint32_t *data_size,
4988 uint32_t *type_enum,
4989 uint32_t *data_type_enum,
4990 uint64_t *raw_max)
4991 {
4992 struct threaded_context *tc = threaded_context(_pipe);
4993 struct pipe_context *pipe = tc->pipe;
4994
4995 pipe->get_intel_perf_query_counter_info(pipe, query_index, counter_index,
4996 name, desc, offset, data_size, type_enum, data_type_enum, raw_max);
4997 }
4998
4999 static struct pipe_query *
5000 tc_new_intel_perf_query_obj(struct pipe_context *_pipe, unsigned query_index)
5001 {
5002 struct threaded_context *tc = threaded_context(_pipe);
5003 struct pipe_context *pipe = tc->pipe;
5004
5005 return pipe->new_intel_perf_query_obj(pipe, query_index);
5006 }
5007
5008 static uint16_t
5009 tc_call_begin_intel_perf_query(struct pipe_context *pipe, void *call)
5010 {
5011 (void)pipe->begin_intel_perf_query(pipe, to_call(call, tc_query_call)->query);
5012 return call_size(tc_query_call);
5013 }
5014
5015 static bool
5016 tc_begin_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
5017 {
5018 struct threaded_context *tc = threaded_context(_pipe);
5019
5020 tc_add_call(tc, TC_CALL_begin_intel_perf_query, tc_query_call)->query = q;
5021
5022 /* assume success, begin failure can be signaled from get_intel_perf_query_data */
5023 return true;
5024 }
5025
5026 static uint16_t
5027 tc_call_end_intel_perf_query(struct pipe_context *pipe, void *call)
5028 {
5029 pipe->end_intel_perf_query(pipe, to_call(call, tc_query_call)->query);
5030 return call_size(tc_query_call);
5031 }
5032
5033 static void
5034 tc_end_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
5035 {
5036 struct threaded_context *tc = threaded_context(_pipe);
5037
5038 tc_add_call(tc, TC_CALL_end_intel_perf_query, tc_query_call)->query = q;
5039 }
5040
5041 static void
5042 tc_delete_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
5043 {
5044 struct threaded_context *tc = threaded_context(_pipe);
5045 struct pipe_context *pipe = tc->pipe;
5046
5047 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
5048 pipe->delete_intel_perf_query(pipe, q);
5049 }
5050
5051 static void
5052 tc_wait_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
5053 {
5054 struct threaded_context *tc = threaded_context(_pipe);
5055 struct pipe_context *pipe = tc->pipe;
5056
5057 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
5058 pipe->wait_intel_perf_query(pipe, q);
5059 }
5060
5061 static bool
5062 tc_is_intel_perf_query_ready(struct pipe_context *_pipe, struct pipe_query *q)
5063 {
5064 struct threaded_context *tc = threaded_context(_pipe);
5065 struct pipe_context *pipe = tc->pipe;
5066
5067 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
5068 return pipe->is_intel_perf_query_ready(pipe, q);
5069 }
5070
5071 static bool
5072 tc_get_intel_perf_query_data(struct pipe_context *_pipe,
5073 struct pipe_query *q,
5074 size_t data_size,
5075 uint32_t *data,
5076 uint32_t *bytes_written)
5077 {
5078 struct threaded_context *tc = threaded_context(_pipe);
5079 struct pipe_context *pipe = tc->pipe;
5080
5081 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
5082 return pipe->get_intel_perf_query_data(pipe, q, data_size, data, bytes_written);
5083 }
5084
5085 /********************************************************************
5086 * callback
5087 */
5088
5089 struct tc_callback_call {
5090 struct tc_call_base base;
5091 void (*fn)(void *data);
5092 void *data;
5093 };
5094
5095 static uint16_t
5096 tc_call_callback(UNUSED struct pipe_context *pipe, void *call)
5097 {
5098 struct tc_callback_call *p = to_call(call, tc_callback_call);
5099
5100 p->fn(p->data);
5101 return call_size(tc_callback_call);
5102 }
5103
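/* If the caller asks for "asap" execution and the threaded context is
 * already synchronized (nothing queued for the driver thread), the callback
 * runs immediately on the application thread; otherwise it is recorded and
 * runs on the driver thread in order with the other calls.
 */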
5104 static void
5105 tc_callback(struct pipe_context *_pipe, void (*fn)(void *), void *data,
5106 bool asap)
5107 {
5108 struct threaded_context *tc = threaded_context(_pipe);
5109
5110 if (asap && tc_is_sync(tc)) {
5111 fn(data);
5112 return;
5113 }
5114
5115 struct tc_callback_call *p =
5116 tc_add_call(tc, TC_CALL_callback, tc_callback_call);
5117 p->fn = fn;
5118 p->data = data;
5119 }
5120
5121
5122 /********************************************************************
5123 * create & destroy
5124 */
5125
5126 static void
5127 tc_destroy(struct pipe_context *_pipe)
5128 {
5129 struct threaded_context *tc = threaded_context(_pipe);
5130 struct pipe_context *pipe = tc->pipe;
5131
5132 if (tc->base.const_uploader &&
5133 tc->base.stream_uploader != tc->base.const_uploader)
5134 u_upload_destroy(tc->base.const_uploader);
5135
5136 if (tc->base.stream_uploader)
5137 u_upload_destroy(tc->base.stream_uploader);
5138
5139 tc_sync(tc);
5140
5141 if (util_queue_is_initialized(&tc->queue)) {
5142 util_queue_destroy(&tc->queue);
5143
5144 for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
5145 util_queue_fence_destroy(&tc->batch_slots[i].fence);
5146 util_dynarray_fini(&tc->batch_slots[i].renderpass_infos);
5147 assert(!tc->batch_slots[i].token);
5148 }
5149 }
5150
5151 slab_destroy_child(&tc->pool_transfers);
5152 assert(tc->batch_slots[tc->next].num_total_slots == 0);
5153 pipe->destroy(pipe);
5154
5155 for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++) {
5156 if (!util_queue_fence_is_signalled(&tc->buffer_lists[i].driver_flushed_fence))
5157 util_queue_fence_signal(&tc->buffer_lists[i].driver_flushed_fence);
5158 util_queue_fence_destroy(&tc->buffer_lists[i].driver_flushed_fence);
5159 }
5160
5161 for (unsigned i = 0; i < ARRAY_SIZE(tc->fb_resources); i++)
5162 pipe_resource_reference(&tc->fb_resources[i], NULL);
5163 pipe_resource_reference(&tc->fb_resolve, NULL);
5164
5165 FREE(tc);
5166 }
5167
5168 void tc_driver_internal_flush_notify(struct threaded_context *tc)
5169 {
5170 /* Allow drivers to call this function even for internal contexts that
5171 * don't have tc. It simplifies drivers.
5172 */
5173 if (!tc)
5174 return;
5175
5176 /* Signal fences set by tc_batch_execute. */
5177 for (unsigned i = 0; i < tc->num_signal_fences_next_flush; i++)
5178 util_queue_fence_signal(tc->signal_fences_next_flush[i]);
5179
5180 tc->num_signal_fences_next_flush = 0;
5181 }
5182
5183 /**
5184 * Wrap an existing pipe_context into a threaded_context.
5185 *
5186 * \param pipe pipe_context to wrap
5187 * \param parent_transfer_pool parent slab pool set up for creating pipe_-
5188 * transfer objects; the driver should have one
5189 * in pipe_screen.
5190 * \param replace_buffer callback for replacing a pipe_resource's storage
5191 * with another pipe_resource's storage.
5192 * \param options optional TC options/callbacks
5193 * \param out if successful and "out" != NULL, the threaded_context is
5194 * also returned here in addition to the return value
5195 */
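/* A minimal usage sketch from a driver's context-creation path; the my_*
 * names are illustrative and only threaded_context_create() plus the
 * GALLIUM_THREAD environment variable are part of this code:
 *
 *    struct threaded_context *tc;
 *    struct pipe_context *ctx =
 *       threaded_context_create(my_create_context(screen, flags),
 *                               &my_screen->transfer_pool,
 *                               my_replace_buffer_storage,
 *                               NULL, // or &options, e.g. for renderpass parsing
 *                               &tc);
 *
 * "ctx" is what the driver returns to the frontend; it is the original,
 * unwrapped context when threading is disabled (e.g. GALLIUM_THREAD=0) and
 * may be NULL on allocation failure.
 */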
5196 struct pipe_context *
5197 threaded_context_create(struct pipe_context *pipe,
5198 struct slab_parent_pool *parent_transfer_pool,
5199 tc_replace_buffer_storage_func replace_buffer,
5200 const struct threaded_context_options *options,
5201 struct threaded_context **out)
5202 {
5203 struct threaded_context *tc;
5204
5205 if (!pipe)
5206 return NULL;
5207
5208 if (!debug_get_bool_option("GALLIUM_THREAD", true))
5209 return pipe;
5210
5211 tc = CALLOC_STRUCT(threaded_context);
5212 if (!tc) {
5213 pipe->destroy(pipe);
5214 return NULL;
5215 }
5216
5217 if (options) {
5218 /* this combination is unimplementable */
5219 assert(!(options->parse_renderpass_info && options->driver_calls_flush_notify));
5220 tc->options = *options;
5221 }
5222
5223 pipe = trace_context_create_threaded(pipe->screen, pipe, &replace_buffer, &tc->options);
5224
5225 /* The driver context isn't wrapped, so set its "priv" to NULL. */
5226 pipe->priv = NULL;
5227
5228 tc->pipe = pipe;
5229 tc->replace_buffer_storage = replace_buffer;
5230 tc->map_buffer_alignment =
5231 pipe->screen->caps.min_map_buffer_alignment;
5232 tc->ubo_alignment =
5233 MAX2(pipe->screen->caps.constant_buffer_offset_alignment, 64);
5234 tc->base.priv = pipe; /* priv points to the wrapped driver context */
5235 tc->base.screen = pipe->screen;
5236 tc->base.destroy = tc_destroy;
5237 tc->base.callback = tc_callback;
5238
5239 tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
5240 if (pipe->stream_uploader == pipe->const_uploader)
5241 tc->base.const_uploader = tc->base.stream_uploader;
5242 else
5243 tc->base.const_uploader = u_upload_clone(&tc->base, pipe->const_uploader);
5244
5245 if (!tc->base.stream_uploader || !tc->base.const_uploader)
5246 goto fail;
5247
5248 tc->use_forced_staging_uploads = true;
5249
5250 /* The queue size is the number of batches "waiting". Batches are removed
5251 * from the queue before being executed, so keep one tc_batch slot for that
5252 * execution. Also, keep one unused slot for an unflushed batch.
5253 */
5254 if (!util_queue_init(&tc->queue, "gdrv", TC_MAX_BATCHES - 2, 1, 0, NULL))
5255 goto fail;
5256
5257 tc->last_completed = -1;
5258 for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
5259 #if !defined(NDEBUG) && TC_DEBUG >= 1
5260 tc->batch_slots[i].sentinel = TC_SENTINEL;
5261 #endif
5262 tc->batch_slots[i].tc = tc;
5263 tc->batch_slots[i].batch_idx = i;
5264 util_queue_fence_init(&tc->batch_slots[i].fence);
5265 tc->batch_slots[i].renderpass_info_idx = -1;
5266 if (tc->options.parse_renderpass_info) {
5267 util_dynarray_init(&tc->batch_slots[i].renderpass_infos, NULL);
5268 tc_batch_renderpass_infos_resize(tc, &tc->batch_slots[i]);
5269 }
5270 }
5271 for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++)
5272 util_queue_fence_init(&tc->buffer_lists[i].driver_flushed_fence);
5273
5274 list_inithead(&tc->unflushed_queries);
5275
5276 slab_create_child(&tc->pool_transfers, parent_transfer_pool);
5277
5278 /* If you have different limits in each shader stage, set the maximum. */
5279 struct pipe_screen *screen = pipe->screen;
5280 tc->max_const_buffers =
5281 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5282 PIPE_SHADER_CAP_MAX_CONST_BUFFERS);
5283 tc->max_shader_buffers =
5284 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5285 PIPE_SHADER_CAP_MAX_SHADER_BUFFERS);
5286 tc->max_images =
5287 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5288 PIPE_SHADER_CAP_MAX_SHADER_IMAGES);
5289 tc->max_samplers =
5290 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5291 PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS);
5292
5293 tc->base.set_context_param = tc_set_context_param; /* always set this */
5294
5295 #define CTX_INIT(_member) \
5296 tc->base._member = tc->pipe->_member ? tc_##_member : NULL
5297
5298 CTX_INIT(flush);
5299 CTX_INIT(draw_vbo);
5300 CTX_INIT(draw_vertex_state);
5301 CTX_INIT(launch_grid);
5302 CTX_INIT(resource_copy_region);
5303 CTX_INIT(blit);
5304 CTX_INIT(clear);
5305 CTX_INIT(clear_render_target);
5306 CTX_INIT(clear_depth_stencil);
5307 CTX_INIT(clear_buffer);
5308 CTX_INIT(clear_texture);
5309 CTX_INIT(flush_resource);
5310 CTX_INIT(generate_mipmap);
5311 CTX_INIT(render_condition);
5312 CTX_INIT(create_query);
5313 CTX_INIT(create_batch_query);
5314 CTX_INIT(destroy_query);
5315 CTX_INIT(begin_query);
5316 CTX_INIT(end_query);
5317 CTX_INIT(get_query_result);
5318 CTX_INIT(get_query_result_resource);
5319 CTX_INIT(set_active_query_state);
5320 CTX_INIT(create_blend_state);
5321 CTX_INIT(bind_blend_state);
5322 CTX_INIT(delete_blend_state);
5323 CTX_INIT(create_sampler_state);
5324 CTX_INIT(bind_sampler_states);
5325 CTX_INIT(delete_sampler_state);
5326 CTX_INIT(create_rasterizer_state);
5327 CTX_INIT(bind_rasterizer_state);
5328 CTX_INIT(delete_rasterizer_state);
5329 CTX_INIT(create_depth_stencil_alpha_state);
5330 CTX_INIT(bind_depth_stencil_alpha_state);
5331 CTX_INIT(delete_depth_stencil_alpha_state);
5332 CTX_INIT(link_shader);
5333 CTX_INIT(create_fs_state);
5334 CTX_INIT(bind_fs_state);
5335 CTX_INIT(delete_fs_state);
5336 CTX_INIT(create_vs_state);
5337 CTX_INIT(bind_vs_state);
5338 CTX_INIT(delete_vs_state);
5339 CTX_INIT(create_gs_state);
5340 CTX_INIT(bind_gs_state);
5341 CTX_INIT(delete_gs_state);
5342 CTX_INIT(create_tcs_state);
5343 CTX_INIT(bind_tcs_state);
5344 CTX_INIT(delete_tcs_state);
5345 CTX_INIT(create_tes_state);
5346 CTX_INIT(bind_tes_state);
5347 CTX_INIT(delete_tes_state);
5348 CTX_INIT(create_compute_state);
5349 CTX_INIT(bind_compute_state);
5350 CTX_INIT(delete_compute_state);
5351 CTX_INIT(create_vertex_elements_state);
5352 CTX_INIT(bind_vertex_elements_state);
5353 CTX_INIT(delete_vertex_elements_state);
5354 CTX_INIT(set_blend_color);
5355 CTX_INIT(set_stencil_ref);
5356 CTX_INIT(set_sample_mask);
5357 CTX_INIT(set_min_samples);
5358 CTX_INIT(set_clip_state);
5359 CTX_INIT(set_constant_buffer);
5360 CTX_INIT(set_inlinable_constants);
5361 CTX_INIT(set_framebuffer_state);
5362 CTX_INIT(set_polygon_stipple);
5363 CTX_INIT(set_sample_locations);
5364 CTX_INIT(set_scissor_states);
5365 CTX_INIT(set_viewport_states);
5366 CTX_INIT(set_window_rectangles);
5367 CTX_INIT(set_sampler_views);
5368 CTX_INIT(set_tess_state);
5369 CTX_INIT(set_patch_vertices);
5370 CTX_INIT(set_shader_buffers);
5371 CTX_INIT(set_shader_images);
5372 CTX_INIT(set_vertex_buffers);
5373 CTX_INIT(create_stream_output_target);
5374 CTX_INIT(stream_output_target_destroy);
5375 CTX_INIT(set_stream_output_targets);
5376 CTX_INIT(create_sampler_view);
5377 CTX_INIT(sampler_view_destroy);
5378 CTX_INIT(create_surface);
5379 CTX_INIT(surface_destroy);
5380 CTX_INIT(buffer_map);
5381 CTX_INIT(texture_map);
5382 CTX_INIT(transfer_flush_region);
5383 CTX_INIT(buffer_unmap);
5384 CTX_INIT(texture_unmap);
5385 CTX_INIT(buffer_subdata);
5386 CTX_INIT(texture_subdata);
5387 CTX_INIT(texture_barrier);
5388 CTX_INIT(memory_barrier);
5389 CTX_INIT(resource_commit);
5390 CTX_INIT(create_video_codec);
5391 CTX_INIT(create_video_buffer);
5392 CTX_INIT(set_compute_resources);
5393 CTX_INIT(set_global_binding);
5394 CTX_INIT(get_sample_position);
5395 CTX_INIT(invalidate_resource);
5396 CTX_INIT(get_device_reset_status);
5397 CTX_INIT(set_device_reset_callback);
5398 CTX_INIT(dump_debug_state);
5399 CTX_INIT(set_log_context);
5400 CTX_INIT(emit_string_marker);
5401 CTX_INIT(set_debug_callback);
5402 CTX_INIT(create_fence_fd);
5403 CTX_INIT(fence_server_sync);
5404 CTX_INIT(fence_server_signal);
5405 CTX_INIT(get_timestamp);
5406 CTX_INIT(create_texture_handle);
5407 CTX_INIT(delete_texture_handle);
5408 CTX_INIT(make_texture_handle_resident);
5409 CTX_INIT(create_image_handle);
5410 CTX_INIT(delete_image_handle);
5411 CTX_INIT(make_image_handle_resident);
5412 CTX_INIT(set_frontend_noop);
5413 CTX_INIT(init_intel_perf_query_info);
5414 CTX_INIT(get_intel_perf_query_info);
5415 CTX_INIT(get_intel_perf_query_counter_info);
5416 CTX_INIT(new_intel_perf_query_obj);
5417 CTX_INIT(begin_intel_perf_query);
5418 CTX_INIT(end_intel_perf_query);
5419 CTX_INIT(delete_intel_perf_query);
5420 CTX_INIT(wait_intel_perf_query);
5421 CTX_INIT(is_intel_perf_query_ready);
5422 CTX_INIT(get_intel_perf_query_data);
5423 #undef CTX_INIT
5424
5425 #define CALL(name) tc->execute_func[TC_CALL_##name] = tc_call_##name;
5426 #include "u_threaded_context_calls.h"
5427 #undef CALL
5428
5429 if (out)
5430 *out = tc;
5431
5432 tc_begin_next_buffer_list(tc);
5433 if (tc->options.parse_renderpass_info)
5434 tc_batch_increment_renderpass_info(tc, tc->next, false);
5435 return &tc->base;
5436
5437 fail:
5438 tc_destroy(&tc->base);
5439 return NULL;
5440 }
5441
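/* Example: passing a divisor of 4 caps bytes_mapped_limit at a quarter of
 * total system memory (with an additional 512 MB clamp in 32-bit builds,
 * where address space is the scarcer resource); the limit is used to
 * throttle how much mapped buffer memory the context lets accumulate.
 */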
5442 void
5443 threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned divisor)
5444 {
5445 uint64_t total_ram;
5446 if (os_get_total_physical_memory(&total_ram)) {
5447 tc->bytes_mapped_limit = total_ram / divisor;
5448 if (sizeof(void*) == 4)
5449 tc->bytes_mapped_limit = MIN2(tc->bytes_mapped_limit, 512*1024*1024UL);
5450 }
5451 }
5452
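/* Intended to be called from the driver thread while executing a batch: it
 * waits until the recorded info is ready and then walks the "next" chain to
 * its most recent continuation before returning it.
 */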
5453 const struct tc_renderpass_info *
5454 threaded_context_get_renderpass_info(struct threaded_context *tc)
5455 {
5456 assert(tc->renderpass_info && tc->options.parse_renderpass_info);
5457 struct tc_batch_rp_info *info = tc_batch_rp_info(tc->renderpass_info);
5458 while (1) {
5459 util_queue_fence_wait(&info->ready);
5460 if (!info->next)
5461 return &info->info;
5462 info = info->next;
5463 }
5464 }
5465