1 /**************************************************************************
2 *
3 * Copyright 2017 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * on the rights to use, copy, modify, merge, publish, distribute, sub
10 * license, and/or sell copies of the Software, and to permit persons to whom
11 * the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23 * USE OR OTHER DEALINGS IN THE SOFTWARE.
24 *
25 **************************************************************************/
26
27 #include "util/u_threaded_context.h"
28 #include "util/u_cpu_detect.h"
29 #include "util/format/u_format.h"
30 #include "util/u_inlines.h"
31 #include "util/u_memory.h"
32 #include "util/u_upload_mgr.h"
33 #include "driver_trace/tr_context.h"
34 #include "util/log.h"
35 #include "util/perf/cpu_trace.h"
36 #include "util/thread_sched.h"
37 #include "compiler/shader_info.h"
38
39 #if TC_DEBUG >= 1
40 #define tc_assert assert
41 #else
42 #define tc_assert(x)
43 #endif
44
45 #if TC_DEBUG >= 2
46 #define tc_printf mesa_logi
47 #define tc_asprintf asprintf
48 #define tc_strcmp strcmp
49 #else
50 #define tc_printf(...)
51 #define tc_asprintf(...) 0
52 #define tc_strcmp(...) 0
53 #endif
54
55 #define TC_SENTINEL 0x5ca1ab1e
56
57 #if TC_DEBUG >= 3 || defined(TC_TRACE)
58 static const char *tc_call_names[] = {
59 #define CALL(name) #name,
60 #include "u_threaded_context_calls.h"
61 #undef CALL
62 };
63 #endif
64
65 #ifdef TC_TRACE
66 # define TC_TRACE_SCOPE(call_id) MESA_TRACE_SCOPE(tc_call_names[call_id])
67 #else
68 # define TC_TRACE_SCOPE(call_id)
69 #endif
70
71 enum tc_call_id {
72 #define CALL(name) TC_CALL_##name,
73 #include "u_threaded_context_calls.h"
74 #undef CALL
75 TC_NUM_CALLS,
76 TC_END_BATCH = TC_NUM_CALLS,
77 };
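/* Illustration (not generated code): u_threaded_context_calls.h contains one
 * CALL(name) line per queued entry point, so the include above expands to one
 * enumerator per call, e.g. (actual names depend on that header's contents):
 *
 *    TC_CALL_flush,
 *    TC_CALL_draw_single,
 *    ...
 *
 * The same CALL() list also expands to the tc_call_names[] debug strings
 * above, which is why the two always stay in sync.
 */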
78
79 static void
80 tc_batch_execute(void *job, UNUSED void *gdata, int thread_index);
81
82 static void
83 tc_buffer_subdata(struct pipe_context *_pipe,
84 struct pipe_resource *resource,
85 unsigned usage, unsigned offset,
86 unsigned size, const void *data);
87
88 static void
89 tc_batch_check(UNUSED struct tc_batch *batch)
90 {
91 tc_assert(batch->sentinel == TC_SENTINEL);
92 tc_assert(batch->num_total_slots <= TC_SLOTS_PER_BATCH);
93 }
94
95 static void
96 tc_debug_check(struct threaded_context *tc)
97 {
98 for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
99 tc_batch_check(&tc->batch_slots[i]);
100 tc_assert(tc->batch_slots[i].tc == tc);
101 }
102 }
103
104 static void
105 tc_set_driver_thread(struct threaded_context *tc)
106 {
107 #ifndef NDEBUG
108 tc->driver_thread = thrd_current();
109 #endif
110 }
111
112 static void
113 tc_clear_driver_thread(struct threaded_context *tc)
114 {
115 #ifndef NDEBUG
116 memset(&tc->driver_thread, 0, sizeof(tc->driver_thread));
117 #endif
118 }
119
120 struct tc_batch_rp_info {
121 /* this is what drivers can see */
122 struct tc_renderpass_info info;
123 /* determines whether the info can be "safely" read by drivers or if it may still be in use */
124 struct util_queue_fence ready;
125 /* when a batch is full, the rp info rolls over onto 'next' */
126 struct tc_batch_rp_info *next;
127 /* when rp info has rolled over onto this struct, 'prev' is used to update pointers for realloc */
128 struct tc_batch_rp_info *prev;
129 };
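/* Rollover sketch (comment only, see tc_batch_increment_renderpass_info):
 * if a batch fills up while a renderpass is still being recorded, the next
 * batch gets a full_copy increment that links old_info->next = new_info and
 * new_info->prev = old_info. 'prev' lets tc_batch_renderpass_infos_resize()
 * repoint the previous batch's 'next' when this batch's info array is
 * reallocated, and the driver is expected to follow 'next' (via
 * threaded_context_get_renderpass_info()) to keep reading the same logical
 * renderpass across batches.
 */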
130
131 static struct tc_batch_rp_info *
132 tc_batch_rp_info(struct tc_renderpass_info *info)
133 {
134 return (struct tc_batch_rp_info *)info;
135 }
136
137 static void
138 tc_sanitize_renderpass_info(struct threaded_context *tc)
139 {
140 tc->renderpass_info_recording->cbuf_invalidate = 0;
141 tc->renderpass_info_recording->zsbuf_invalidate = false;
142 tc->renderpass_info_recording->cbuf_load |= (~tc->renderpass_info_recording->cbuf_clear) & BITFIELD_MASK(PIPE_MAX_COLOR_BUFS);
143 if (tc->fb_resources[PIPE_MAX_COLOR_BUFS] && !tc_renderpass_info_is_zsbuf_used(tc->renderpass_info_recording))
144 /* this should be a "safe" way to indicate to the driver that both loads and stores are required;
145 * driver can always detect invalidation
146 */
147 tc->renderpass_info_recording->zsbuf_clear_partial = true;
148 if (tc->num_queries_active)
149 tc->renderpass_info_recording->has_query_ends = true;
150 }
151
152 /* ensure the batch's array of renderpass data is large enough for the current index */
153 static void
154 tc_batch_renderpass_infos_resize(struct threaded_context *tc, struct tc_batch *batch)
155 {
156 unsigned size = batch->renderpass_infos.capacity;
157 unsigned cur_num = MAX2(batch->renderpass_info_idx, 0);
158
159 if (size / sizeof(struct tc_batch_rp_info) > cur_num)
160 return;
161
162 struct tc_batch_rp_info *infos = batch->renderpass_infos.data;
163 unsigned old_idx = batch->renderpass_info_idx - 1;
164 bool redo = tc->renderpass_info_recording &&
165 tc->renderpass_info_recording == &infos[old_idx].info;
166 if (!util_dynarray_resize(&batch->renderpass_infos, struct tc_batch_rp_info, cur_num + 10))
167 mesa_loge("tc: memory alloc fail!");
168
169 if (size != batch->renderpass_infos.capacity) {
170 /* zero new allocation region */
171 uint8_t *data = batch->renderpass_infos.data;
172 memset(data + size, 0, batch->renderpass_infos.capacity - size);
173 unsigned start = size / sizeof(struct tc_batch_rp_info);
174 unsigned count = (batch->renderpass_infos.capacity - size) /
175 sizeof(struct tc_batch_rp_info);
176 infos = batch->renderpass_infos.data;
177 if (infos->prev)
178 infos->prev->next = infos;
179 for (unsigned i = 0; i < count; i++)
180 util_queue_fence_init(&infos[start + i].ready);
181 /* re-set current recording info on resize */
182 if (redo)
183 tc->renderpass_info_recording = &infos[old_idx].info;
184 }
185 }
186
187 /* signal that the renderpass info is "ready" for use by drivers and will no longer be updated */
188 static void
189 tc_signal_renderpass_info_ready(struct threaded_context *tc)
190 {
191 if (tc->renderpass_info_recording &&
192 !util_queue_fence_is_signalled(&tc_batch_rp_info(tc->renderpass_info_recording)->ready))
193 util_queue_fence_signal(&tc_batch_rp_info(tc->renderpass_info_recording)->ready);
194 }
195
196 /* increment the current renderpass info struct for recording
197 * 'full_copy' is used for preserving data across non-blocking tc batch flushes
198 */
199 static void
200 tc_batch_increment_renderpass_info(struct threaded_context *tc, unsigned batch_idx, bool full_copy)
201 {
202 struct tc_batch *batch = &tc->batch_slots[batch_idx];
203 struct tc_batch_rp_info *tc_info = batch->renderpass_infos.data;
204
205 if (tc_info[0].next || batch->num_total_slots) {
206 /* deadlock condition detected: all batches are in flight, renderpass hasn't ended
207 * (probably a cts case)
208 */
209 struct tc_batch_rp_info *info = tc_batch_rp_info(tc->renderpass_info_recording);
210 if (!util_queue_fence_is_signalled(&info->ready)) {
211 /* this batch is actively executing and the driver is waiting on the recording fence to signal */
212 /* force all buffer usage to avoid data loss */
213 info->info.cbuf_load = ~(BITFIELD_MASK(8) & info->info.cbuf_clear);
214 info->info.zsbuf_clear_partial = true;
215 info->info.has_query_ends = tc->num_queries_active > 0;
216 /* ensure threaded_context_get_renderpass_info() won't deadlock */
217 info->next = NULL;
218 util_queue_fence_signal(&info->ready);
219 }
220 /* always wait on the batch to finish since this will otherwise overwrite thread data */
221 util_queue_fence_wait(&batch->fence);
222 }
223 /* increment rp info and initialize it */
224 batch->renderpass_info_idx++;
225 tc_batch_renderpass_infos_resize(tc, batch);
226 tc_info = batch->renderpass_infos.data;
227
228 if (full_copy) {
229 /* this should only be called when changing batches */
230 assert(batch->renderpass_info_idx == 0);
231 /* copy the previous data in its entirety: this is still the same renderpass */
232 if (tc->renderpass_info_recording) {
233 tc_info[batch->renderpass_info_idx].info.data = tc->renderpass_info_recording->data;
234 tc_batch_rp_info(tc->renderpass_info_recording)->next = &tc_info[batch->renderpass_info_idx];
235 tc_info[batch->renderpass_info_idx].prev = tc_batch_rp_info(tc->renderpass_info_recording);
236 /* guard against deadlock scenario */
237 assert(&tc_batch_rp_info(tc->renderpass_info_recording)->next->info != tc->renderpass_info_recording);
238 } else {
239 tc_info[batch->renderpass_info_idx].info.data = 0;
240 tc_info[batch->renderpass_info_idx].prev = NULL;
241 }
242 } else {
243 /* selectively copy: only the CSO metadata is copied, and a new framebuffer state will be added later */
244 tc_info[batch->renderpass_info_idx].info.data = 0;
245 if (tc->renderpass_info_recording) {
246 tc_info[batch->renderpass_info_idx].info.data16[2] = tc->renderpass_info_recording->data16[2];
247 tc_batch_rp_info(tc->renderpass_info_recording)->next = NULL;
248 tc_info[batch->renderpass_info_idx].prev = NULL;
249 }
250 }
251
252 assert(!full_copy || !tc->renderpass_info_recording || tc_batch_rp_info(tc->renderpass_info_recording)->next);
253 /* signal existing info since it will not be used anymore */
254 tc_signal_renderpass_info_ready(tc);
255 util_queue_fence_reset(&tc_info[batch->renderpass_info_idx].ready);
256 /* guard against deadlock scenario */
257 assert(tc->renderpass_info_recording != &tc_info[batch->renderpass_info_idx].info);
258 /* this is now the current recording renderpass info */
259 tc->renderpass_info_recording = &tc_info[batch->renderpass_info_idx].info;
260 batch->max_renderpass_info_idx = batch->renderpass_info_idx;
261 }
262
263 static ALWAYS_INLINE struct tc_renderpass_info *
264 tc_get_renderpass_info(struct threaded_context *tc)
265 {
266 return tc->renderpass_info_recording;
267 }
268
269 /* update metadata at draw time */
270 static void
271 tc_parse_draw(struct threaded_context *tc)
272 {
273 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
274
275 if (info) {
276 /* all buffers that aren't cleared are considered loaded */
277 info->cbuf_load |= ~info->cbuf_clear;
278 if (!info->zsbuf_clear)
279 info->zsbuf_load = true;
280 /* previous invalidates are no longer relevant */
281 info->cbuf_invalidate = 0;
282 info->zsbuf_invalidate = false;
283 info->has_draw = true;
284 info->has_query_ends |= tc->query_ended;
285 }
286
287 tc->in_renderpass = true;
288 tc->seen_fb_state = true;
289 tc->query_ended = false;
290 }
291
292 static void *
293 to_call_check(void *ptr, unsigned num_slots)
294 {
295 #if TC_DEBUG >= 1
296 struct tc_call_base *call = ptr;
297 tc_assert(call->num_slots == num_slots);
298 #endif
299 return ptr;
300 }
301 #define to_call(ptr, type) ((struct type *)to_call_check((void *)(ptr), call_size(type)))
302
303 #define size_to_slots(size) DIV_ROUND_UP(size, 8)
304 #define call_size(type) size_to_slots(sizeof(struct type))
305 #define call_size_with_slots(type, num_slots) size_to_slots( \
306 sizeof(struct type) + sizeof(((struct type*)NULL)->slot[0]) * (num_slots))
307 #define get_next_call(ptr, type) ((struct type*)((uint64_t*)ptr + call_size(type)))
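/* Worked example (illustrative): batch slots are uint64_t, so a 20-byte call
 * struct needs size_to_slots(20) = DIV_ROUND_UP(20, 8) = 3 slots, and a
 * variable-sized call such as tc_sampler_states with 4 pointers on a 64-bit
 * build needs
 * size_to_slots(sizeof(struct tc_sampler_states) + 4 * sizeof(void *)) slots.
 */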
308
309 ALWAYS_INLINE static void
310 tc_set_resource_batch_usage(struct threaded_context *tc, struct pipe_resource *pres)
311 {
312 /* ignore batch usage when persistent */
313 if (threaded_resource(pres)->last_batch_usage != INT8_MAX)
314 threaded_resource(pres)->last_batch_usage = tc->next;
315 threaded_resource(pres)->batch_generation = tc->batch_generation;
316 }
317
318 ALWAYS_INLINE static void
319 tc_set_resource_batch_usage_persistent(struct threaded_context *tc, struct pipe_resource *pres, bool enable)
320 {
321 if (!pres)
322 return;
323 /* mark with special value to block any unsynchronized access */
324 threaded_resource(pres)->last_batch_usage = enable ? INT8_MAX : tc->next;
325 threaded_resource(pres)->batch_generation = tc->batch_generation;
326 }
327
328 /* this can ONLY be used to check against the currently recording batch */
329 ALWAYS_INLINE static bool
330 tc_resource_batch_usage_test_busy(const struct threaded_context *tc, const struct pipe_resource *pres)
331 {
332 const struct threaded_resource *tbuf = (const struct threaded_resource*)pres;
333
334 if (!tc->options.unsynchronized_texture_subdata)
335 return true;
336
337 /* resource has persistent access: assume always busy */
338 if (tbuf->last_batch_usage == INT8_MAX)
339 return true;
340
341 /* resource has never been seen */
342 if (tbuf->last_batch_usage == -1)
343 return false;
344
345 /* resource has been seen but no batches have executed */
346 if (tc->last_completed == -1)
347 return true;
348
349 /* begin comparisons checking number of times batches have cycled */
350 unsigned diff = tc->batch_generation - tbuf->batch_generation;
351 /* resource has been seen, batches have fully cycled at least once */
352 if (diff > 1)
353 return false;
354
355 /* resource has been seen in current batch cycle: return whether batch has definitely completed */
356 if (diff == 0)
357 return tc->last_completed >= tbuf->last_batch_usage;
358
359 /* resource has been seen within one batch cycle: check for batch wrapping */
360 if (tc->last_completed >= tbuf->last_batch_usage)
361 /* this or a subsequent pre-wrap batch was the last to definitely complete: resource is idle */
362 return false;
363
364 /* batch execution has not definitely wrapped: resource is definitely not idle */
365 if (tc->last_completed > tc->next)
366 return true;
367
368 /* resource was seen pre-wrap, batch execution has definitely wrapped: idle */
369 if (tbuf->last_batch_usage > tc->last_completed)
370 return false;
371
372 /* tc->last_completed is not an exact measurement, so anything else is considered busy */
373 return true;
374 }
375
376 /* Assign src to dst while dst is uninitialized. */
377 static inline void
378 tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
379 {
380 *dst = src;
381 pipe_reference(NULL, &src->reference); /* only increment refcount */
382 }
383
384 /* Assign src to dst while dst is uninitialized. */
385 static inline void
386 tc_set_vertex_state_reference(struct pipe_vertex_state **dst,
387 struct pipe_vertex_state *src)
388 {
389 *dst = src;
390 pipe_reference(NULL, &src->reference); /* only increment refcount */
391 }
392
393 /* Unreference dst but don't touch the dst pointer. */
394 static inline void
395 tc_drop_resource_reference(struct pipe_resource *dst)
396 {
397 if (pipe_reference(&dst->reference, NULL)) /* only decrement refcount */
398 pipe_resource_destroy(dst);
399 }
400
401 /* Unreference dst but don't touch the dst pointer. */
402 static inline void
403 tc_drop_surface_reference(struct pipe_surface *dst)
404 {
405 if (pipe_reference(&dst->reference, NULL)) /* only decrement refcount */
406 dst->context->surface_destroy(dst->context, dst);
407 }
408
409 /* Unreference dst but don't touch the dst pointer. */
410 static inline void
411 tc_drop_so_target_reference(struct pipe_stream_output_target *dst)
412 {
413 if (pipe_reference(&dst->reference, NULL)) /* only decrement refcount */
414 dst->context->stream_output_target_destroy(dst->context, dst);
415 }
416
417 /**
418 * Subtract the given number of references.
419 */
420 static inline void
421 tc_drop_vertex_state_references(struct pipe_vertex_state *dst, int num_refs)
422 {
423 int count = p_atomic_add_return(&dst->reference.count, -num_refs);
424
425 assert(count >= 0);
426 /* Underflows shouldn't happen, but let's be safe. */
427 if (count <= 0)
428 dst->screen->vertex_state_destroy(dst->screen, dst);
429 }
430
431 /* We don't want to read or write min_index and max_index, because
432 * it shouldn't be needed by drivers at this point.
433 */
434 #define DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX \
435 offsetof(struct pipe_draw_info, min_index)
436
437 ALWAYS_INLINE static struct tc_renderpass_info *
438 incr_rp_info(struct tc_renderpass_info *tc_info)
439 {
440 struct tc_batch_rp_info *info = tc_batch_rp_info(tc_info);
441 return &info[1].info;
442 }
443
444 static void
445 tc_begin_next_buffer_list(struct threaded_context *tc)
446 {
447 tc->next_buf_list = (tc->next_buf_list + 1) % TC_MAX_BUFFER_LISTS;
448
449 tc->batch_slots[tc->next].buffer_list_index = tc->next_buf_list;
450
451 /* Clear the buffer list in the new empty batch. */
452 struct tc_buffer_list *buf_list = &tc->buffer_lists[tc->next_buf_list];
453 assert(util_queue_fence_is_signalled(&buf_list->driver_flushed_fence));
454 util_queue_fence_reset(&buf_list->driver_flushed_fence); /* set to unsignalled */
455 BITSET_ZERO(buf_list->buffer_list);
456
457 tc->add_all_gfx_bindings_to_buffer_list = true;
458 tc->add_all_compute_bindings_to_buffer_list = true;
459 }
460
461 static void
462 tc_add_call_end(struct tc_batch *next)
463 {
464 /* Add a dummy last call that won't be executed, but will indicate the end
465 * of the batch. It's for calls that always look at the next call and this
466 * stops them looking farther ahead.
467 */
468 assert(next->num_total_slots < TC_SLOTS_PER_BATCH);
469 struct tc_call_base *call =
470 (struct tc_call_base*)&next->slots[next->num_total_slots];
471 call->call_id = TC_END_BATCH;
472 call->num_slots = 1;
473 #if !defined(NDEBUG) && TC_DEBUG >= 1
474 call->sentinel = TC_SENTINEL;
475 #endif
476 }
477
478 static void
479 tc_batch_flush(struct threaded_context *tc, bool full_copy)
480 {
481 struct tc_batch *next = &tc->batch_slots[tc->next];
482 unsigned next_id = (tc->next + 1) % TC_MAX_BATCHES;
483
484 tc_assert(next->num_total_slots != 0);
485 tc_add_call_end(next);
486
487 tc_batch_check(next);
488 tc_debug_check(tc);
489 tc->bytes_mapped_estimate = 0;
490 tc->bytes_replaced_estimate = 0;
491 p_atomic_add(&tc->num_offloaded_slots, next->num_total_slots);
492
493 if (next->token) {
494 next->token->tc = NULL;
495 tc_unflushed_batch_token_reference(&next->token, NULL);
496 }
497 /* reset renderpass info index for subsequent use */
498 next->renderpass_info_idx = -1;
499
500 /* always increment renderpass info on batch flush;
501 * renderpass info can only be accessed by its owner batch during execution
502 */
503 if (tc->renderpass_info_recording) {
504 tc->batch_slots[next_id].first_set_fb = full_copy;
505 tc_batch_increment_renderpass_info(tc, next_id, full_copy);
506 }
507
508 util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
509 NULL, 0);
510 tc->last = tc->next;
511 tc->next = next_id;
512 if (next_id == 0)
513 tc->batch_generation++;
514 tc_begin_next_buffer_list(tc);
515
516 }
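/* Batch ring behavior (descriptive): batches are reused round-robin out of
 * batch_slots[TC_MAX_BATCHES]; whenever 'next' wraps back to slot 0,
 * batch_generation is bumped, which is the counter that
 * tc_resource_batch_usage_test_busy() compares against a resource's recorded
 * generation to decide whether its last recorded batch has certainly finished.
 */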
517
518 /* This is the function that adds variable-sized calls into the current
519 * batch. It also flushes the batch if there is not enough space there.
520 * All other higher-level "add" functions use it.
521 */
522 static void *
523 tc_add_sized_call(struct threaded_context *tc, enum tc_call_id id,
524 unsigned num_slots)
525 {
526 TC_TRACE_SCOPE(id);
527 struct tc_batch *next = &tc->batch_slots[tc->next];
528 assert(num_slots <= TC_SLOTS_PER_BATCH - 1);
529 tc_debug_check(tc);
530
531 if (unlikely(next->num_total_slots + num_slots > TC_SLOTS_PER_BATCH - 1)) {
532 /* copy existing renderpass info during flush */
533 tc_batch_flush(tc, true);
534 next = &tc->batch_slots[tc->next];
535 tc_assert(next->num_total_slots == 0);
536 tc_assert(next->last_mergeable_call == NULL);
537 }
538
539 tc_assert(util_queue_fence_is_signalled(&next->fence));
540
541 struct tc_call_base *call = (struct tc_call_base*)&next->slots[next->num_total_slots];
542 next->num_total_slots += num_slots;
543
544 #if !defined(NDEBUG) && TC_DEBUG >= 1
545 call->sentinel = TC_SENTINEL;
546 #endif
547 call->call_id = id;
548 call->num_slots = num_slots;
549
550 #if TC_DEBUG >= 3
551 tc_printf("ENQUEUE: %s", tc_call_names[id]);
552 #endif
553
554 tc_debug_check(tc);
555 return call;
556 }
557
558 #define tc_add_call(tc, execute, type) \
559 ((struct type*)tc_add_sized_call(tc, execute, call_size(type)))
560
561 #define tc_add_slot_based_call(tc, execute, type, num_slots) \
562 ((struct type*)tc_add_sized_call(tc, execute, \
563 call_size_with_slots(type, num_slots)))
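/* Usage sketch (hypothetical call name; real calls are generated from
 * u_threaded_context_calls.h):
 *
 *    struct tc_foo_call *p = tc_add_call(tc, TC_CALL_foo, tc_foo_call);
 *    p->param = value;
 *
 * Nothing is executed here; pipe->foo() runs later on the driver thread when
 * tc_batch_execute() walks the batch, so any pointer stored in the call must
 * remain valid (or hold its own reference, see tc_set_resource_reference())
 * until then.
 */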
564
565 /* Returns the last mergeable call that was added to the unflushed
566 * batch, or NULL if the address of that call is not currently known
567 * or no such call exists in the unflushed batch.
568 */
569 static struct tc_call_base *
570 tc_get_last_mergeable_call(struct threaded_context *tc)
571 {
572 struct tc_batch *batch = &tc->batch_slots[tc->next];
573 struct tc_call_base *call = batch->last_mergeable_call;
574
575 tc_assert(call == NULL || call->num_slots <= batch->num_total_slots);
576
577 if (call && (uint64_t *)call == &batch->slots[batch->num_total_slots - call->num_slots])
578 return call;
579 else
580 return NULL;
581 }
582
583 /* Increases the size of the last call in the unflushed batch to the
584 * given number of slots, if possible, without changing the call's data.
585 */
586 static bool
587 tc_enlarge_last_mergeable_call(struct threaded_context *tc, unsigned desired_num_slots)
588 {
589 struct tc_batch *batch = &tc->batch_slots[tc->next];
590 struct tc_call_base *call = tc_get_last_mergeable_call(tc);
591
592 tc_assert(call);
593 tc_assert(desired_num_slots >= call->num_slots);
594
595 unsigned added_slots = desired_num_slots - call->num_slots;
596
597 if (unlikely(batch->num_total_slots + added_slots > TC_SLOTS_PER_BATCH - 1))
598 return false;
599
600 batch->num_total_slots += added_slots;
601 call->num_slots += added_slots;
602
603 return true;
604 }
605
606 static void
607 tc_mark_call_mergeable(struct threaded_context *tc, struct tc_call_base *call)
608 {
609 struct tc_batch *batch = &tc->batch_slots[tc->next];
610 tc_assert(call->num_slots <= batch->num_total_slots);
611 tc_assert((uint64_t *)call == &batch->slots[batch->num_total_slots - call->num_slots]);
612 batch->last_mergeable_call = call;
613 }
614
615 static bool
616 tc_is_sync(struct threaded_context *tc)
617 {
618 struct tc_batch *last = &tc->batch_slots[tc->last];
619 struct tc_batch *next = &tc->batch_slots[tc->next];
620
621 return util_queue_fence_is_signalled(&last->fence) &&
622 !next->num_total_slots;
623 }
624
625 static void
626 _tc_sync(struct threaded_context *tc, UNUSED const char *info, UNUSED const char *func)
627 {
628 struct tc_batch *last = &tc->batch_slots[tc->last];
629 struct tc_batch *next = &tc->batch_slots[tc->next];
630 bool synced = false;
631
632 MESA_TRACE_SCOPE(func);
633
634 tc_debug_check(tc);
635
636 if (tc->options.parse_renderpass_info && tc->in_renderpass && !tc->flushing) {
637 /* corner case: if tc syncs for any reason but a driver flush during a renderpass,
638 * then the current renderpass info MUST be signaled to avoid deadlocking the driver
639 *
640 * this is not a "complete" signal operation, however, as it's unknown what calls may
641 * come after this one, which means that framebuffer attachment data is unreliable
642 *
643 * to avoid erroneously passing bad state to the driver (e.g., allowing zsbuf elimination),
644 * force all attachments active and assume the app was going to get bad perf here anyway
645 */
646 tc_sanitize_renderpass_info(tc);
647 }
648 tc_signal_renderpass_info_ready(tc);
649
650 /* Only wait for queued calls... */
651 if (!util_queue_fence_is_signalled(&last->fence)) {
652 util_queue_fence_wait(&last->fence);
653 synced = true;
654 }
655
656 tc_debug_check(tc);
657
658 if (next->token) {
659 next->token->tc = NULL;
660 tc_unflushed_batch_token_reference(&next->token, NULL);
661 }
662
663 /* .. and execute unflushed calls directly. */
664 if (next->num_total_slots) {
665 p_atomic_add(&tc->num_direct_slots, next->num_total_slots);
666 tc->bytes_mapped_estimate = 0;
667 tc->bytes_replaced_estimate = 0;
668 tc_add_call_end(next);
669 tc_batch_execute(next, NULL, 0);
670 tc_begin_next_buffer_list(tc);
671 synced = true;
672 }
673
674 if (synced) {
675 p_atomic_inc(&tc->num_syncs);
676
677 if (tc_strcmp(func, "tc_destroy") != 0) {
678 tc_printf("sync %s %s", func, info);
679 }
680 }
681
682 tc_debug_check(tc);
683
684 if (tc->options.parse_renderpass_info) {
685 int renderpass_info_idx = next->renderpass_info_idx;
686 if (renderpass_info_idx > 0) {
687 /* don't reset if fb state is unflushed */
688 bool fb_no_draw = tc->seen_fb_state && !tc->renderpass_info_recording->has_draw;
689 uint32_t fb_info = tc->renderpass_info_recording->data32[0];
690 next->renderpass_info_idx = -1;
691 tc_batch_increment_renderpass_info(tc, tc->next, false);
692 if (fb_no_draw)
693 tc->renderpass_info_recording->data32[0] = fb_info;
694 } else if (tc->renderpass_info_recording->has_draw) {
695 tc->renderpass_info_recording->data32[0] = 0;
696 }
697 tc->seen_fb_state = false;
698 tc->query_ended = false;
699 }
700 }
701
702 #define tc_sync(tc) _tc_sync(tc, "", __func__)
703 #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__)
704
705 /**
706 * Call this from fence_finish for same-context fence waits of deferred fences
707 * that haven't been flushed yet.
708 *
709 * The passed pipe_context must be the one passed to pipe_screen::fence_finish,
710 * i.e., the wrapped one.
711 */
712 void
713 threaded_context_flush(struct pipe_context *_pipe,
714 struct tc_unflushed_batch_token *token,
715 bool prefer_async)
716 {
717 struct threaded_context *tc = threaded_context(_pipe);
718
719 /* This is called from the gallium frontend / application thread. */
720 if (token->tc && token->tc == tc) {
721 struct tc_batch *last = &tc->batch_slots[tc->last];
722
723 /* Prefer to do the flush in the driver thread if it is already
724 * running. That should be better for cache locality.
725 */
726 if (prefer_async || !util_queue_fence_is_signalled(&last->fence))
727 tc_batch_flush(tc, false);
728 else
729 tc_sync(token->tc);
730 }
731 }
732
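/* Note: buffer_id_unique is reduced with TC_BUFFER_ID_MASK, so the per-batch
 * buffer list behaves like a Bloom-filter bitset: distinct buffers may collide
 * on the same bit. Collisions are harmless for correctness; they only make
 * tc_is_buffer_busy() answer "busy" conservatively.
 */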
733 static void
734 tc_add_to_buffer_list(struct tc_buffer_list *next, struct pipe_resource *buf)
735 {
736 uint32_t id = threaded_resource(buf)->buffer_id_unique;
737 BITSET_SET(next->buffer_list, id & TC_BUFFER_ID_MASK);
738 }
739
740 /* Reset a range of buffer binding slots. */
741 static void
742 tc_unbind_buffers(uint32_t *binding, unsigned count)
743 {
744 if (count)
745 memset(binding, 0, sizeof(*binding) * count);
746 }
747
748 static void
749 tc_add_bindings_to_buffer_list(BITSET_WORD *buffer_list, const uint32_t *bindings,
750 unsigned count)
751 {
752 for (unsigned i = 0; i < count; i++) {
753 if (bindings[i])
754 BITSET_SET(buffer_list, bindings[i] & TC_BUFFER_ID_MASK);
755 }
756 }
757
758 static unsigned
759 tc_rebind_bindings(uint32_t old_id, uint32_t new_id, uint32_t *bindings,
760 unsigned count)
761 {
762 unsigned rebind_count = 0;
763
764 for (unsigned i = 0; i < count; i++) {
765 if (bindings[i] == old_id) {
766 bindings[i] = new_id;
767 rebind_count++;
768 }
769 }
770 return rebind_count;
771 }
772
773 static void
774 tc_add_shader_bindings_to_buffer_list(struct threaded_context *tc,
775 BITSET_WORD *buffer_list,
776 enum pipe_shader_type shader)
777 {
778 tc_add_bindings_to_buffer_list(buffer_list, tc->const_buffers[shader],
779 tc->max_const_buffers);
780 if (tc->seen_shader_buffers[shader]) {
781 tc_add_bindings_to_buffer_list(buffer_list, tc->shader_buffers[shader],
782 tc->max_shader_buffers);
783 }
784 if (tc->seen_image_buffers[shader]) {
785 tc_add_bindings_to_buffer_list(buffer_list, tc->image_buffers[shader],
786 tc->max_images);
787 }
788 if (tc->seen_sampler_buffers[shader]) {
789 tc_add_bindings_to_buffer_list(buffer_list, tc->sampler_buffers[shader],
790 tc->max_samplers);
791 }
792 }
793
794 static unsigned
795 tc_rebind_shader_bindings(struct threaded_context *tc, uint32_t old_id,
796 uint32_t new_id, enum pipe_shader_type shader, uint32_t *rebind_mask)
797 {
798 unsigned ubo = 0, ssbo = 0, img = 0, sampler = 0;
799
800 ubo = tc_rebind_bindings(old_id, new_id, tc->const_buffers[shader],
801 tc->max_const_buffers);
802 if (ubo)
803 *rebind_mask |= BITFIELD_BIT(TC_BINDING_UBO_VS) << shader;
804 if (tc->seen_shader_buffers[shader]) {
805 ssbo = tc_rebind_bindings(old_id, new_id, tc->shader_buffers[shader],
806 tc->max_shader_buffers);
807 if (ssbo)
808 *rebind_mask |= BITFIELD_BIT(TC_BINDING_SSBO_VS) << shader;
809 }
810 if (tc->seen_image_buffers[shader]) {
811 img = tc_rebind_bindings(old_id, new_id, tc->image_buffers[shader],
812 tc->max_images);
813 if (img)
814 *rebind_mask |= BITFIELD_BIT(TC_BINDING_IMAGE_VS) << shader;
815 }
816 if (tc->seen_sampler_buffers[shader]) {
817 sampler = tc_rebind_bindings(old_id, new_id, tc->sampler_buffers[shader],
818 tc->max_samplers);
819 if (sampler)
820 *rebind_mask |= BITFIELD_BIT(TC_BINDING_SAMPLERVIEW_VS) << shader;
821 }
822 return ubo + ssbo + img + sampler;
823 }
824
825 /* Add all bound buffers used by VS/TCS/TES/GS/FS to the buffer list.
826 * This is called by the first draw call in a batch when we want to inherit
827 * all bindings set by the previous batch.
828 */
829 static void
830 tc_add_all_gfx_bindings_to_buffer_list(struct threaded_context *tc)
831 {
832 BITSET_WORD *buffer_list = tc->buffer_lists[tc->next_buf_list].buffer_list;
833
834 tc_add_bindings_to_buffer_list(buffer_list, tc->vertex_buffers, tc->num_vertex_buffers);
835 if (tc->seen_streamout_buffers)
836 tc_add_bindings_to_buffer_list(buffer_list, tc->streamout_buffers, PIPE_MAX_SO_BUFFERS);
837
838 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_VERTEX);
839 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_FRAGMENT);
840
841 if (tc->seen_tcs)
842 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_TESS_CTRL);
843 if (tc->seen_tes)
844 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_TESS_EVAL);
845 if (tc->seen_gs)
846 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_GEOMETRY);
847
848 tc->add_all_gfx_bindings_to_buffer_list = false;
849 }
850
851 /* Add all bound buffers used by compute to the buffer list.
852 * This is called by the first compute call in a batch when we want to inherit
853 * all bindings set by the previous batch.
854 */
855 static void
856 tc_add_all_compute_bindings_to_buffer_list(struct threaded_context *tc)
857 {
858 BITSET_WORD *buffer_list = tc->buffer_lists[tc->next_buf_list].buffer_list;
859
860 tc_add_shader_bindings_to_buffer_list(tc, buffer_list, PIPE_SHADER_COMPUTE);
861 tc->add_all_compute_bindings_to_buffer_list = false;
862 }
863
864 static unsigned
865 tc_rebind_buffer(struct threaded_context *tc, uint32_t old_id, uint32_t new_id, uint32_t *rebind_mask)
866 {
867 unsigned vbo = 0, so = 0;
868
869 vbo = tc_rebind_bindings(old_id, new_id, tc->vertex_buffers,
870 tc->num_vertex_buffers);
871 if (vbo)
872 *rebind_mask |= BITFIELD_BIT(TC_BINDING_VERTEX_BUFFER);
873
874 if (tc->seen_streamout_buffers) {
875 so = tc_rebind_bindings(old_id, new_id, tc->streamout_buffers,
876 PIPE_MAX_SO_BUFFERS);
877 if (so)
878 *rebind_mask |= BITFIELD_BIT(TC_BINDING_STREAMOUT_BUFFER);
879 }
880 unsigned rebound = vbo + so;
881
882 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_VERTEX, rebind_mask);
883 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_FRAGMENT, rebind_mask);
884
885 if (tc->seen_tcs)
886 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_TESS_CTRL, rebind_mask);
887 if (tc->seen_tes)
888 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_TESS_EVAL, rebind_mask);
889 if (tc->seen_gs)
890 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_GEOMETRY, rebind_mask);
891
892 rebound += tc_rebind_shader_bindings(tc, old_id, new_id, PIPE_SHADER_COMPUTE, rebind_mask);
893
894 if (rebound)
895 BITSET_SET(tc->buffer_lists[tc->next_buf_list].buffer_list, new_id & TC_BUFFER_ID_MASK);
896 return rebound;
897 }
898
899 static bool
900 tc_is_buffer_bound_with_mask(uint32_t id, uint32_t *bindings, unsigned binding_mask)
901 {
902 while (binding_mask) {
903 if (bindings[u_bit_scan(&binding_mask)] == id)
904 return true;
905 }
906 return false;
907 }
908
909 static bool
910 tc_is_buffer_shader_bound_for_write(struct threaded_context *tc, uint32_t id,
911 enum pipe_shader_type shader)
912 {
913 if (tc->seen_shader_buffers[shader] &&
914 tc_is_buffer_bound_with_mask(id, tc->shader_buffers[shader],
915 tc->shader_buffers_writeable_mask[shader]))
916 return true;
917
918 if (tc->seen_image_buffers[shader] &&
919 tc_is_buffer_bound_with_mask(id, tc->image_buffers[shader],
920 tc->image_buffers_writeable_mask[shader]))
921 return true;
922
923 return false;
924 }
925
926 static bool
927 tc_is_buffer_bound_for_write(struct threaded_context *tc, uint32_t id)
928 {
929 if (tc->seen_streamout_buffers &&
930 tc_is_buffer_bound_with_mask(id, tc->streamout_buffers,
931 BITFIELD_MASK(PIPE_MAX_SO_BUFFERS)))
932 return true;
933
934 if (tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_VERTEX) ||
935 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_FRAGMENT) ||
936 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_COMPUTE))
937 return true;
938
939 if (tc->seen_tcs &&
940 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_TESS_CTRL))
941 return true;
942
943 if (tc->seen_tes &&
944 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_TESS_EVAL))
945 return true;
946
947 if (tc->seen_gs &&
948 tc_is_buffer_shader_bound_for_write(tc, id, PIPE_SHADER_GEOMETRY))
949 return true;
950
951 return false;
952 }
953
954 static bool
955 tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf,
956 unsigned map_usage)
957 {
958 if (!tc->options.is_resource_busy)
959 return true;
960
961 uint32_t id_hash = tbuf->buffer_id_unique & TC_BUFFER_ID_MASK;
962
963 for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++) {
964 struct tc_buffer_list *buf_list = &tc->buffer_lists[i];
965
966 /* If the buffer is referenced by a batch that hasn't been flushed (by tc or the driver),
967 * then the buffer is considered busy. */
968 if (!util_queue_fence_is_signalled(&buf_list->driver_flushed_fence) &&
969 BITSET_TEST(buf_list->buffer_list, id_hash))
970 return true;
971 }
972
973 /* The buffer isn't referenced by any unflushed batch: we can safely ask the driver whether
974 * this buffer is busy or not. */
975 return tc->options.is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage);
976 }
977
978 /**
979 * allow_cpu_storage should be false for user memory and imported buffers.
980 */
981 void
982 threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage)
983 {
984 struct threaded_resource *tres = threaded_resource(res);
985
986 tres->latest = &tres->b;
987 tres->cpu_storage = NULL;
988 util_range_init(&tres->valid_buffer_range);
989 tres->is_shared = false;
990 tres->is_user_ptr = false;
991 tres->buffer_id_unique = 0;
992 tres->pending_staging_uploads = 0;
993 tres->last_batch_usage = -1;
994 util_range_init(&tres->pending_staging_uploads_range);
995
996 if (allow_cpu_storage &&
997 !(res->flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
998 PIPE_RESOURCE_FLAG_SPARSE |
999 PIPE_RESOURCE_FLAG_ENCRYPTED)) &&
1000 /* We need buffer invalidation and buffer busyness tracking for the CPU
1001 * storage, which aren't supported with pipe_vertex_state. */
1002 !(res->bind & PIPE_BIND_VERTEX_STATE))
1003 tres->allow_cpu_storage = true;
1004 else
1005 tres->allow_cpu_storage = false;
1006 }
1007
1008 void
1009 threaded_resource_deinit(struct pipe_resource *res)
1010 {
1011 struct threaded_resource *tres = threaded_resource(res);
1012
1013 if (tres->latest != &tres->b)
1014 pipe_resource_reference(&tres->latest, NULL);
1015 util_range_destroy(&tres->valid_buffer_range);
1016 util_range_destroy(&tres->pending_staging_uploads_range);
1017 align_free(tres->cpu_storage);
1018 }
1019
1020 struct pipe_context *
1021 threaded_context_unwrap_sync(struct pipe_context *pipe)
1022 {
1023 if (!pipe || !pipe->priv)
1024 return pipe;
1025
1026 tc_sync(threaded_context(pipe));
1027 return (struct pipe_context*)pipe->priv;
1028 }
1029
1030
1031 /********************************************************************
1032 * simple functions
1033 */
1034
1035 #define TC_FUNC1(func, qualifier, type, deref, addr, ...) \
1036 struct tc_call_##func { \
1037 struct tc_call_base base; \
1038 type state; \
1039 }; \
1040 \
1041 static uint16_t ALWAYS_INLINE \
1042 tc_call_##func(struct pipe_context *pipe, void *call) \
1043 { \
1044 pipe->func(pipe, addr(to_call(call, tc_call_##func)->state)); \
1045 return call_size(tc_call_##func); \
1046 } \
1047 \
1048 static void \
1049 tc_##func(struct pipe_context *_pipe, qualifier type deref param) \
1050 { \
1051 struct threaded_context *tc = threaded_context(_pipe); \
1052 struct tc_call_##func *p = (struct tc_call_##func*) \
1053 tc_add_call(tc, TC_CALL_##func, tc_call_##func); \
1054 p->state = deref(param); \
1055 __VA_ARGS__; \
1056 }
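/* Expansion sketch: TC_FUNC1(set_sample_mask, , unsigned, , ) below generates
 * roughly
 *
 *    struct tc_call_set_sample_mask { struct tc_call_base base; unsigned state; };
 *
 *    static uint16_t ALWAYS_INLINE
 *    tc_call_set_sample_mask(struct pipe_context *pipe, void *call)
 *    {
 *       pipe->set_sample_mask(pipe, to_call(call, tc_call_set_sample_mask)->state);
 *       return call_size(tc_call_set_sample_mask);
 *    }
 *
 *    static void
 *    tc_set_sample_mask(struct pipe_context *_pipe, unsigned param)
 *    {
 *       struct threaded_context *tc = threaded_context(_pipe);
 *       struct tc_call_set_sample_mask *p = (struct tc_call_set_sample_mask *)
 *          tc_add_call(tc, TC_CALL_set_sample_mask, tc_call_set_sample_mask);
 *       p->state = param;
 *    }
 *
 * i.e. the frontend-side wrapper only records the parameter; the driver thread
 * replays the real pipe call later.
 */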
1057
1058 TC_FUNC1(set_active_query_state, , bool, , )
1059
1060 TC_FUNC1(set_blend_color, const, struct pipe_blend_color, *, &)
1061 TC_FUNC1(set_stencil_ref, const, struct pipe_stencil_ref, , )
1062 TC_FUNC1(set_clip_state, const, struct pipe_clip_state, *, &)
1063 TC_FUNC1(set_sample_mask, , unsigned, , )
1064 TC_FUNC1(set_min_samples, , unsigned, , )
1065 TC_FUNC1(set_polygon_stipple, const, struct pipe_poly_stipple, *, &)
1066
1067 TC_FUNC1(texture_barrier, , unsigned, , )
1068 TC_FUNC1(memory_barrier, , unsigned, , )
1069 TC_FUNC1(delete_texture_handle, , uint64_t, , )
1070 TC_FUNC1(delete_image_handle, , uint64_t, , )
1071 TC_FUNC1(set_frontend_noop, , bool, , )
1072
1073
1074 /********************************************************************
1075 * queries
1076 */
1077
1078 static struct pipe_query *
1079 tc_create_query(struct pipe_context *_pipe, unsigned query_type,
1080 unsigned index)
1081 {
1082 struct threaded_context *tc = threaded_context(_pipe);
1083 struct pipe_context *pipe = tc->pipe;
1084
1085 return pipe->create_query(pipe, query_type, index);
1086 }
1087
1088 static struct pipe_query *
1089 tc_create_batch_query(struct pipe_context *_pipe, unsigned num_queries,
1090 unsigned *query_types)
1091 {
1092 struct threaded_context *tc = threaded_context(_pipe);
1093 struct pipe_context *pipe = tc->pipe;
1094
1095 return pipe->create_batch_query(pipe, num_queries, query_types);
1096 }
1097
1098 struct tc_query_call {
1099 struct tc_call_base base;
1100 struct pipe_query *query;
1101 };
1102
1103 static uint16_t ALWAYS_INLINE
1104 tc_call_destroy_query(struct pipe_context *pipe, void *call)
1105 {
1106 struct pipe_query *query = to_call(call, tc_query_call)->query;
1107 struct threaded_query *tq = threaded_query(query);
1108
1109 if (list_is_linked(&tq->head_unflushed))
1110 list_del(&tq->head_unflushed);
1111
1112 pipe->destroy_query(pipe, query);
1113 return call_size(tc_query_call);
1114 }
1115
1116 static void
1117 tc_destroy_query(struct pipe_context *_pipe, struct pipe_query *query)
1118 {
1119 struct threaded_context *tc = threaded_context(_pipe);
1120
1121 tc_add_call(tc, TC_CALL_destroy_query, tc_query_call)->query = query;
1122 }
1123
1124 static uint16_t ALWAYS_INLINE
1125 tc_call_begin_query(struct pipe_context *pipe, void *call)
1126 {
1127 pipe->begin_query(pipe, to_call(call, tc_query_call)->query);
1128 return call_size(tc_query_call);
1129 }
1130
1131 static bool
1132 tc_begin_query(struct pipe_context *_pipe, struct pipe_query *query)
1133 {
1134 struct threaded_context *tc = threaded_context(_pipe);
1135 tc->num_queries_active++;
1136
1137 tc_add_call(tc, TC_CALL_begin_query, tc_query_call)->query = query;
1138 return true; /* we don't care about the return value for this call */
1139 }
1140
1141 struct tc_end_query_call {
1142 struct tc_call_base base;
1143 struct threaded_context *tc;
1144 struct pipe_query *query;
1145 };
1146
1147 static uint16_t ALWAYS_INLINE
1148 tc_call_end_query(struct pipe_context *pipe, void *call)
1149 {
1150 struct tc_end_query_call *p = to_call(call, tc_end_query_call);
1151 struct threaded_query *tq = threaded_query(p->query);
1152
1153 if (!list_is_linked(&tq->head_unflushed))
1154 list_add(&tq->head_unflushed, &p->tc->unflushed_queries);
1155
1156 pipe->end_query(pipe, p->query);
1157 return call_size(tc_end_query_call);
1158 }
1159
1160 static bool
1161 tc_end_query(struct pipe_context *_pipe, struct pipe_query *query)
1162 {
1163 struct threaded_context *tc = threaded_context(_pipe);
1164 struct threaded_query *tq = threaded_query(query);
1165 struct tc_end_query_call *call =
1166 tc_add_call(tc, TC_CALL_end_query, tc_end_query_call);
1167 tc->num_queries_active--;
1168
1169 call->tc = tc;
1170 call->query = query;
1171
1172 tq->flushed = false;
1173 tc->query_ended = true;
1174
1175 return true; /* we don't care about the return value for this call */
1176 }
1177
1178 static bool
1179 tc_get_query_result(struct pipe_context *_pipe,
1180 struct pipe_query *query, bool wait,
1181 union pipe_query_result *result)
1182 {
1183 struct threaded_context *tc = threaded_context(_pipe);
1184 struct threaded_query *tq = threaded_query(query);
1185 struct pipe_context *pipe = tc->pipe;
1186 bool flushed = tq->flushed;
1187
1188 if (!flushed) {
1189 tc_sync_msg(tc, wait ? "wait" : "nowait");
1190 tc_set_driver_thread(tc);
1191 }
1192
1193 bool success = pipe->get_query_result(pipe, query, wait, result);
1194
1195 if (!flushed)
1196 tc_clear_driver_thread(tc);
1197
1198 if (success) {
1199 tq->flushed = true;
1200 if (list_is_linked(&tq->head_unflushed)) {
1201 /* This is safe because it can only happen after we sync'd. */
1202 list_del(&tq->head_unflushed);
1203 }
1204 }
1205 return success;
1206 }
1207
1208 struct tc_query_result_resource {
1209 struct tc_call_base base;
1210 enum pipe_query_flags flags:8;
1211 enum pipe_query_value_type result_type:8;
1212 int8_t index; /* it can be -1 */
1213 unsigned offset;
1214 struct pipe_query *query;
1215 struct pipe_resource *resource;
1216 };
1217
1218 static uint16_t ALWAYS_INLINE
1219 tc_call_get_query_result_resource(struct pipe_context *pipe, void *call)
1220 {
1221 struct tc_query_result_resource *p = to_call(call, tc_query_result_resource);
1222
1223 pipe->get_query_result_resource(pipe, p->query, p->flags, p->result_type,
1224 p->index, p->resource, p->offset);
1225 tc_drop_resource_reference(p->resource);
1226 return call_size(tc_query_result_resource);
1227 }
1228
1229 static void
1230 tc_get_query_result_resource(struct pipe_context *_pipe,
1231 struct pipe_query *query,
1232 enum pipe_query_flags flags,
1233 enum pipe_query_value_type result_type, int index,
1234 struct pipe_resource *resource, unsigned offset)
1235 {
1236 struct threaded_context *tc = threaded_context(_pipe);
1237
1238 tc_buffer_disable_cpu_storage(resource);
1239
1240 struct tc_query_result_resource *p =
1241 tc_add_call(tc, TC_CALL_get_query_result_resource,
1242 tc_query_result_resource);
1243 p->query = query;
1244 p->flags = flags;
1245 p->result_type = result_type;
1246 p->index = index;
1247 tc_set_resource_reference(&p->resource, resource);
1248 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], resource);
1249 p->offset = offset;
1250 }
1251
1252 struct tc_render_condition {
1253 struct tc_call_base base;
1254 bool condition;
1255 unsigned mode;
1256 struct pipe_query *query;
1257 };
1258
1259 static uint16_t ALWAYS_INLINE
1260 tc_call_render_condition(struct pipe_context *pipe, void *call)
1261 {
1262 struct tc_render_condition *p = to_call(call, tc_render_condition);
1263 pipe->render_condition(pipe, p->query, p->condition, p->mode);
1264 return call_size(tc_render_condition);
1265 }
1266
1267 static void
1268 tc_render_condition(struct pipe_context *_pipe,
1269 struct pipe_query *query, bool condition,
1270 enum pipe_render_cond_flag mode)
1271 {
1272 struct threaded_context *tc = threaded_context(_pipe);
1273 struct tc_render_condition *p =
1274 tc_add_call(tc, TC_CALL_render_condition, tc_render_condition);
1275
1276 p->query = query;
1277 p->condition = condition;
1278 p->mode = mode;
1279 }
1280
1281
1282 /********************************************************************
1283 * constant (immutable) states
1284 */
1285
1286 #define TC_CSO_CREATE(name, sname) \
1287 static void * \
1288 tc_create_##name##_state(struct pipe_context *_pipe, \
1289 const struct pipe_##sname##_state *state) \
1290 { \
1291 struct pipe_context *pipe = threaded_context(_pipe)->pipe; \
1292 return pipe->create_##name##_state(pipe, state); \
1293 }
1294
1295 #define TC_CSO_BIND(name, ...) TC_FUNC1(bind_##name##_state, , void *, , , ##__VA_ARGS__)
1296 #define TC_CSO_DELETE(name) TC_FUNC1(delete_##name##_state, , void *, , )
1297
1298 #define TC_CSO(name, sname, ...) \
1299 TC_CSO_CREATE(name, sname) \
1300 TC_CSO_BIND(name, ##__VA_ARGS__) \
1301 TC_CSO_DELETE(name)
1302
1303 #define TC_CSO_WHOLE(name) TC_CSO(name, name)
1304 #define TC_CSO_SHADER(name) TC_CSO(name, shader)
1305 #define TC_CSO_SHADER_TRACK(name) TC_CSO(name, shader, tc->seen_##name = true;)
1306
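/* For example, TC_CSO_WHOLE(blend) below produces tc_create_blend_state(),
 * which forwards straight to the driver (create functions must therefore be
 * callable from the application thread), plus tc_bind_blend_state() and
 * tc_delete_blend_state(), which are queued through TC_FUNC1 as void* calls.
 * TC_CSO_SHADER_TRACK additionally sets tc->seen_<stage> at bind time so the
 * buffer-list and rebind code can skip stages that were never bound.
 */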
1307 TC_CSO_WHOLE(blend)
1308 TC_CSO_WHOLE(rasterizer)
1309 TC_CSO_CREATE(depth_stencil_alpha, depth_stencil_alpha)
1310 TC_CSO_BIND(depth_stencil_alpha,
1311 if (param && tc->options.parse_renderpass_info) {
1312 /* dsa info is only ever added during a renderpass;
1313 * changes outside of a renderpass reset the data
1314 */
1315 if (!tc->in_renderpass) {
1316 tc_get_renderpass_info(tc)->zsbuf_write_dsa = 0;
1317 tc_get_renderpass_info(tc)->zsbuf_read_dsa = 0;
1318 }
1319 /* let the driver parse its own state */
1320 tc->options.dsa_parse(param, tc_get_renderpass_info(tc));
1321 }
1322 )
1323 TC_CSO_DELETE(depth_stencil_alpha)
1324 TC_CSO_WHOLE(compute)
1325 TC_CSO_CREATE(fs, shader)
1326 TC_CSO_BIND(fs,
1327 if (param && tc->options.parse_renderpass_info) {
1328 /* fs info is only ever added during a renderpass;
1329 * changes outside of a renderpass reset the data
1330 */
1331 if (!tc->in_renderpass) {
1332 tc_get_renderpass_info(tc)->cbuf_fbfetch = 0;
1333 tc_get_renderpass_info(tc)->zsbuf_write_fs = 0;
1334 }
1335 /* let the driver parse its own state */
1336 tc->options.fs_parse(param, tc_get_renderpass_info(tc));
1337 }
1338 )
1339 TC_CSO_DELETE(fs)
1340 TC_CSO_SHADER(vs)
1341 TC_CSO_SHADER_TRACK(gs)
1342 TC_CSO_SHADER_TRACK(tcs)
1343 TC_CSO_SHADER_TRACK(tes)
1344 TC_CSO_CREATE(sampler, sampler)
1345 TC_CSO_DELETE(sampler)
1346 TC_CSO_BIND(vertex_elements)
1347 TC_CSO_DELETE(vertex_elements)
1348
1349 static void *
1350 tc_create_vertex_elements_state(struct pipe_context *_pipe, unsigned count,
1351 const struct pipe_vertex_element *elems)
1352 {
1353 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
1354
1355 return pipe->create_vertex_elements_state(pipe, count, elems);
1356 }
1357
1358 struct tc_sampler_states {
1359 struct tc_call_base base;
1360 uint8_t shader, start, count;
1361 void *slot[0]; /* more will be allocated if needed */
1362 };
1363
1364 static uint16_t ALWAYS_INLINE
1365 tc_call_bind_sampler_states(struct pipe_context *pipe, void *call)
1366 {
1367 struct tc_sampler_states *p = (struct tc_sampler_states *)call;
1368
1369 pipe->bind_sampler_states(pipe, p->shader, p->start, p->count, p->slot);
1370 return p->base.num_slots;
1371 }
1372
1373 static void
1374 tc_bind_sampler_states(struct pipe_context *_pipe,
1375 enum pipe_shader_type shader,
1376 unsigned start, unsigned count, void **states)
1377 {
1378 if (!count)
1379 return;
1380
1381 struct threaded_context *tc = threaded_context(_pipe);
1382 struct tc_sampler_states *p =
1383 tc_add_slot_based_call(tc, TC_CALL_bind_sampler_states, tc_sampler_states, count);
1384
1385 p->shader = shader;
1386 p->start = start;
1387 p->count = count;
1388 memcpy(p->slot, states, count * sizeof(states[0]));
1389 }
1390
1391 static void
1392 tc_link_shader(struct pipe_context *_pipe, void **shaders)
1393 {
1394 struct threaded_context *tc = threaded_context(_pipe);
1395 tc->pipe->link_shader(tc->pipe, shaders);
1396 }
1397 /********************************************************************
1398 * immediate states
1399 */
1400
1401 struct tc_framebuffer {
1402 struct tc_call_base base;
1403 struct pipe_framebuffer_state state;
1404 };
1405
1406 static uint16_t ALWAYS_INLINE
1407 tc_call_set_framebuffer_state(struct pipe_context *pipe, void *call)
1408 {
1409 struct pipe_framebuffer_state *p = &to_call(call, tc_framebuffer)->state;
1410
1411 pipe->set_framebuffer_state(pipe, p);
1412
1413 unsigned nr_cbufs = p->nr_cbufs;
1414 for (unsigned i = 0; i < nr_cbufs; i++)
1415 tc_drop_surface_reference(p->cbufs[i]);
1416 tc_drop_surface_reference(p->zsbuf);
1417 tc_drop_resource_reference(p->resolve);
1418 return call_size(tc_framebuffer);
1419 }
1420
1421 static void
1422 tc_set_framebuffer_state(struct pipe_context *_pipe,
1423 const struct pipe_framebuffer_state *fb)
1424 {
1425 struct threaded_context *tc = threaded_context(_pipe);
1426 struct tc_framebuffer *p =
1427 tc_add_call(tc, TC_CALL_set_framebuffer_state, tc_framebuffer);
1428 unsigned nr_cbufs = fb->nr_cbufs;
1429
1430 p->state.width = fb->width;
1431 p->state.height = fb->height;
1432 p->state.samples = fb->samples;
1433 p->state.layers = fb->layers;
1434 p->state.nr_cbufs = nr_cbufs;
1435 p->state.viewmask = fb->viewmask;
1436
1437 /* when unbinding, mark attachments as used for the current batch */
1438 for (unsigned i = 0; i < tc->nr_cbufs; i++) {
1439 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[i], false);
1440 pipe_resource_reference(&tc->fb_resources[i], NULL);
1441 }
1442 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[PIPE_MAX_COLOR_BUFS], false);
1443 tc_set_resource_batch_usage_persistent(tc, tc->fb_resolve, false);
1444
1445 for (unsigned i = 0; i < nr_cbufs; i++) {
1446 p->state.cbufs[i] = NULL;
1447 pipe_surface_reference(&p->state.cbufs[i], fb->cbufs[i]);
1448 /* full tracking requires storing the fb attachment resources */
1449 if (fb->cbufs[i])
1450 pipe_resource_reference(&tc->fb_resources[i], fb->cbufs[i]->texture);
1451 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[i], true);
1452 }
1453 tc->nr_cbufs = nr_cbufs;
1454 if (tc->options.parse_renderpass_info) {
1455 /* ensure this is treated as the first fb set if no fb activity has occurred */
1456 if (!tc->renderpass_info_recording->has_draw &&
1457 !tc->renderpass_info_recording->cbuf_clear &&
1458 !tc->renderpass_info_recording->cbuf_load &&
1459 !tc->renderpass_info_recording->zsbuf_load &&
1460 !tc->renderpass_info_recording->zsbuf_clear_partial)
1461 tc->batch_slots[tc->next].first_set_fb = false;
1462 /* store existing zsbuf data for possible persistence */
1463 uint8_t zsbuf = tc->renderpass_info_recording->has_draw ?
1464 0 :
1465 tc->renderpass_info_recording->data8[3];
1466 bool zsbuf_changed = tc->fb_resources[PIPE_MAX_COLOR_BUFS] !=
1467 (fb->zsbuf ? fb->zsbuf->texture : NULL);
1468
1469 if (tc->seen_fb_state) {
1470 /* this is the end of a renderpass, so increment the renderpass info */
1471 tc_batch_increment_renderpass_info(tc, tc->next, false);
1472 /* if zsbuf hasn't changed (i.e., possibly just adding a color buffer):
1473 * keep zsbuf usage data
1474 */
1475 if (!zsbuf_changed)
1476 tc->renderpass_info_recording->data8[3] = zsbuf;
1477 } else {
1478 /* this is the first time a set_framebuffer_call is triggered;
1479 * just increment the index and keep using the existing info for recording
1480 */
1481 tc->batch_slots[tc->next].renderpass_info_idx = 0;
1482 }
1483 /* future fb state changes will increment the index */
1484 tc->seen_fb_state = true;
1485 }
1486 pipe_resource_reference(&tc->fb_resources[PIPE_MAX_COLOR_BUFS],
1487 fb->zsbuf ? fb->zsbuf->texture : NULL);
1488 pipe_resource_reference(&tc->fb_resolve, fb->resolve);
1489 tc_set_resource_batch_usage_persistent(tc, tc->fb_resources[PIPE_MAX_COLOR_BUFS], true);
1490 tc_set_resource_batch_usage_persistent(tc, tc->fb_resolve, true);
1491 tc->in_renderpass = false;
1492 p->state.zsbuf = NULL;
1493 pipe_surface_reference(&p->state.zsbuf, fb->zsbuf);
1494 p->state.resolve = NULL;
1495 pipe_resource_reference(&p->state.resolve, fb->resolve);
1496 }
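/* Note on the renderpass-info bookkeeping above (descriptive only): with
 * tc->options.parse_renderpass_info enabled, the first set_framebuffer_state
 * that tc sees keeps writing into the current renderpass record, while every
 * later one ends that record and starts a new one via
 * tc_batch_increment_renderpass_info(); if the depth/stencil attachment did
 * not change and nothing has been drawn yet, the recorded zsbuf usage bits
 * are carried over into the new record.
 */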
1497
1498 struct tc_tess_state {
1499 struct tc_call_base base;
1500 float state[6];
1501 };
1502
1503 static uint16_t ALWAYS_INLINE
1504 tc_call_set_tess_state(struct pipe_context *pipe, void *call)
1505 {
1506 float *p = to_call(call, tc_tess_state)->state;
1507
1508 pipe->set_tess_state(pipe, p, p + 4);
1509 return call_size(tc_tess_state);
1510 }
1511
1512 static void
1513 tc_set_tess_state(struct pipe_context *_pipe,
1514 const float default_outer_level[4],
1515 const float default_inner_level[2])
1516 {
1517 struct threaded_context *tc = threaded_context(_pipe);
1518 float *p = tc_add_call(tc, TC_CALL_set_tess_state, tc_tess_state)->state;
1519
1520 memcpy(p, default_outer_level, 4 * sizeof(float));
1521 memcpy(p + 4, default_inner_level, 2 * sizeof(float));
1522 }
1523
1524 struct tc_patch_vertices {
1525 struct tc_call_base base;
1526 uint8_t patch_vertices;
1527 };
1528
1529 static uint16_t ALWAYS_INLINE
1530 tc_call_set_patch_vertices(struct pipe_context *pipe, void *call)
1531 {
1532 uint8_t patch_vertices = to_call(call, tc_patch_vertices)->patch_vertices;
1533
1534 pipe->set_patch_vertices(pipe, patch_vertices);
1535 return call_size(tc_patch_vertices);
1536 }
1537
1538 static void
1539 tc_set_patch_vertices(struct pipe_context *_pipe, uint8_t patch_vertices)
1540 {
1541 struct threaded_context *tc = threaded_context(_pipe);
1542
1543 tc_add_call(tc, TC_CALL_set_patch_vertices,
1544 tc_patch_vertices)->patch_vertices = patch_vertices;
1545 }
1546
1547 struct tc_constant_buffer_base {
1548 struct tc_call_base base;
1549 uint8_t shader, index;
1550 bool is_null;
1551 };
1552
1553 struct tc_constant_buffer {
1554 struct tc_constant_buffer_base base;
1555 struct pipe_constant_buffer cb;
1556 };
1557
1558 static uint16_t ALWAYS_INLINE
1559 tc_call_set_constant_buffer(struct pipe_context *pipe, void *call)
1560 {
1561 struct tc_constant_buffer *p = (struct tc_constant_buffer *)call;
1562
1563 if (unlikely(p->base.is_null)) {
1564 pipe->set_constant_buffer(pipe, p->base.shader, p->base.index, false, NULL);
1565 return call_size(tc_constant_buffer_base);
1566 }
1567
1568 pipe->set_constant_buffer(pipe, p->base.shader, p->base.index, true, &p->cb);
1569 return call_size(tc_constant_buffer);
1570 }
1571
1572 static void
1573 tc_set_constant_buffer(struct pipe_context *_pipe,
1574 enum pipe_shader_type shader, uint index,
1575 bool take_ownership,
1576 const struct pipe_constant_buffer *cb)
1577 {
1578 struct threaded_context *tc = threaded_context(_pipe);
1579
1580 if (unlikely(!cb || (!cb->buffer && !cb->user_buffer))) {
1581 struct tc_constant_buffer_base *p =
1582 tc_add_call(tc, TC_CALL_set_constant_buffer, tc_constant_buffer_base);
1583 p->shader = shader;
1584 p->index = index;
1585 p->is_null = true;
1586 tc_unbind_buffer(&tc->const_buffers[shader][index]);
1587 return;
1588 }
1589
1590 struct pipe_resource *buffer;
1591 unsigned offset;
1592
1593 if (cb->user_buffer) {
1594 /* This must be done before adding set_constant_buffer, because it could
1595 * generate e.g. transfer_unmap and flush partially-uninitialized
1596 * set_constant_buffer to the driver if it was done afterwards.
1597 */
1598 buffer = NULL;
1599 u_upload_data(tc->base.const_uploader, 0, cb->buffer_size,
1600 tc->ubo_alignment, cb->user_buffer, &offset, &buffer);
1601 u_upload_unmap(tc->base.const_uploader);
1602 take_ownership = true;
1603 } else {
1604 buffer = cb->buffer;
1605 offset = cb->buffer_offset;
1606 }
1607
1608 struct tc_constant_buffer *p =
1609 tc_add_call(tc, TC_CALL_set_constant_buffer, tc_constant_buffer);
1610 p->base.shader = shader;
1611 p->base.index = index;
1612 p->base.is_null = false;
1613 p->cb.user_buffer = NULL;
1614 p->cb.buffer_offset = offset;
1615 p->cb.buffer_size = cb->buffer_size;
1616
1617 if (take_ownership)
1618 p->cb.buffer = buffer;
1619 else
1620 tc_set_resource_reference(&p->cb.buffer, buffer);
1621
1622 if (buffer) {
1623 tc_bind_buffer(&tc->const_buffers[shader][index],
1624 &tc->buffer_lists[tc->next_buf_list], buffer);
1625 } else {
1626 tc_unbind_buffer(&tc->const_buffers[shader][index]);
1627 }
1628 }
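/* Example of the user-buffer path above (a sketch; "my_consts" is just an
 * illustrative array, and PIPE_SHADER_FRAGMENT/slot 0 are arbitrary): the
 * constants are copied into a real GPU buffer by u_upload_data() in the
 * application thread, so the caller's pointer does not need to outlive the
 * call and the driver thread only ever sees a pipe_resource:
 *
 *    float my_consts[4] = {0.0f, 1.0f, 2.0f, 3.0f};
 *    struct pipe_constant_buffer cb = {
 *       .user_buffer = my_consts,
 *       .buffer_size = sizeof(my_consts),
 *    };
 *    pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, false, &cb);
 */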
1629
1630 struct tc_inlinable_constants {
1631 struct tc_call_base base;
1632 uint8_t shader;
1633 uint8_t num_values;
1634 uint32_t values[MAX_INLINABLE_UNIFORMS];
1635 };
1636
1637 static uint16_t ALWAYS_INLINE
1638 tc_call_set_inlinable_constants(struct pipe_context *pipe, void *call)
1639 {
1640 struct tc_inlinable_constants *p = to_call(call, tc_inlinable_constants);
1641
1642 pipe->set_inlinable_constants(pipe, p->shader, p->num_values, p->values);
1643 return call_size(tc_inlinable_constants);
1644 }
1645
1646 static void
1647 tc_set_inlinable_constants(struct pipe_context *_pipe,
1648 enum pipe_shader_type shader,
1649 uint num_values, uint32_t *values)
1650 {
1651 struct threaded_context *tc = threaded_context(_pipe);
1652 struct tc_inlinable_constants *p =
1653 tc_add_call(tc, TC_CALL_set_inlinable_constants, tc_inlinable_constants);
1654 p->shader = shader;
1655 p->num_values = num_values;
1656 memcpy(p->values, values, num_values * 4);
1657 }
1658
1659 struct tc_sample_locations {
1660 struct tc_call_base base;
1661 uint16_t size;
1662 uint8_t slot[0];
1663 };
1664
1665
1666 static uint16_t ALWAYS_INLINE
1667 tc_call_set_sample_locations(struct pipe_context *pipe, void *call)
1668 {
1669 struct tc_sample_locations *p = (struct tc_sample_locations *)call;
1670
1671 pipe->set_sample_locations(pipe, p->size, p->slot);
1672 return p->base.num_slots;
1673 }
1674
1675 static void
1676 tc_set_sample_locations(struct pipe_context *_pipe, size_t size, const uint8_t *locations)
1677 {
1678 struct threaded_context *tc = threaded_context(_pipe);
1679 struct tc_sample_locations *p =
1680 tc_add_slot_based_call(tc, TC_CALL_set_sample_locations,
1681 tc_sample_locations, size);
1682
1683 p->size = size;
1684 memcpy(p->slot, locations, size);
1685 }
1686
1687 struct tc_scissors {
1688 struct tc_call_base base;
1689 uint8_t start, count;
1690 struct pipe_scissor_state slot[0]; /* more will be allocated if needed */
1691 };
1692
1693 static uint16_t ALWAYS_INLINE
1694 tc_call_set_scissor_states(struct pipe_context *pipe, void *call)
1695 {
1696 struct tc_scissors *p = (struct tc_scissors *)call;
1697
1698 pipe->set_scissor_states(pipe, p->start, p->count, p->slot);
1699 return p->base.num_slots;
1700 }
1701
1702 static void
1703 tc_set_scissor_states(struct pipe_context *_pipe,
1704 unsigned start, unsigned count,
1705 const struct pipe_scissor_state *states)
1706 {
1707 struct threaded_context *tc = threaded_context(_pipe);
1708 struct tc_scissors *p =
1709 tc_add_slot_based_call(tc, TC_CALL_set_scissor_states, tc_scissors, count);
1710
1711 p->start = start;
1712 p->count = count;
1713 memcpy(&p->slot, states, count * sizeof(states[0]));
1714 }
1715
1716 struct tc_viewports {
1717 struct tc_call_base base;
1718 uint8_t start, count;
1719 struct pipe_viewport_state slot[0]; /* more will be allocated if needed */
1720 };
1721
1722 static uint16_t ALWAYS_INLINE
1723 tc_call_set_viewport_states(struct pipe_context *pipe, void *call)
1724 {
1725 struct tc_viewports *p = (struct tc_viewports *)call;
1726
1727 pipe->set_viewport_states(pipe, p->start, p->count, p->slot);
1728 return p->base.num_slots;
1729 }
1730
1731 static void
1732 tc_set_viewport_states(struct pipe_context *_pipe,
1733 unsigned start, unsigned count,
1734 const struct pipe_viewport_state *states)
1735 {
1736 if (!count)
1737 return;
1738
1739 struct threaded_context *tc = threaded_context(_pipe);
1740 struct tc_viewports *p =
1741 tc_add_slot_based_call(tc, TC_CALL_set_viewport_states, tc_viewports, count);
1742
1743 p->start = start;
1744 p->count = count;
1745 memcpy(&p->slot, states, count * sizeof(states[0]));
1746 }
1747
1748 struct tc_window_rects {
1749 struct tc_call_base base;
1750 bool include;
1751 uint8_t count;
1752 struct pipe_scissor_state slot[0]; /* more will be allocated if needed */
1753 };
1754
1755 static uint16_t ALWAYS_INLINE
1756 tc_call_set_window_rectangles(struct pipe_context *pipe, void *call)
1757 {
1758 struct tc_window_rects *p = (struct tc_window_rects *)call;
1759
1760 pipe->set_window_rectangles(pipe, p->include, p->count, p->slot);
1761 return p->base.num_slots;
1762 }
1763
1764 static void
1765 tc_set_window_rectangles(struct pipe_context *_pipe, bool include,
1766 unsigned count,
1767 const struct pipe_scissor_state *rects)
1768 {
1769 struct threaded_context *tc = threaded_context(_pipe);
1770 struct tc_window_rects *p =
1771 tc_add_slot_based_call(tc, TC_CALL_set_window_rectangles, tc_window_rects, count);
1772
1773 p->include = include;
1774 p->count = count;
1775 memcpy(p->slot, rects, count * sizeof(rects[0]));
1776 }
1777
1778 struct tc_sampler_views {
1779 struct tc_call_base base;
1780 uint8_t shader, start, count, unbind_num_trailing_slots;
1781 struct pipe_sampler_view *slot[0]; /* more will be allocated if needed */
1782 };
1783
1784 static uint16_t ALWAYS_INLINE
1785 tc_call_set_sampler_views(struct pipe_context *pipe, void *call)
1786 {
1787 struct tc_sampler_views *p = (struct tc_sampler_views *)call;
1788
1789 pipe->set_sampler_views(pipe, p->shader, p->start, p->count,
1790 p->unbind_num_trailing_slots, true, p->slot);
1791 return p->base.num_slots;
1792 }
1793
1794 static void
1795 tc_set_sampler_views(struct pipe_context *_pipe,
1796 enum pipe_shader_type shader,
1797 unsigned start, unsigned count,
1798 unsigned unbind_num_trailing_slots, bool take_ownership,
1799 struct pipe_sampler_view **views)
1800 {
1801 if (!count && !unbind_num_trailing_slots)
1802 return;
1803
1804 struct threaded_context *tc = threaded_context(_pipe);
1805 struct tc_sampler_views *p =
1806 tc_add_slot_based_call(tc, TC_CALL_set_sampler_views, tc_sampler_views,
1807 views ? count : 0);
1808
1809 p->shader = shader;
1810 p->start = start;
1811
1812 if (views) {
1813 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
1814
1815 p->count = count;
1816 p->unbind_num_trailing_slots = unbind_num_trailing_slots;
1817
1818 if (take_ownership) {
1819 memcpy(p->slot, views, sizeof(*views) * count);
1820
1821 for (unsigned i = 0; i < count; i++) {
1822 if (views[i]) {
1823 if (views[i]->target == PIPE_BUFFER)
1824 tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next,
1825 views[i]->texture);
1826 else
1827 tc_set_resource_batch_usage(tc, views[i]->texture);
1828 } else {
1829 tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]);
1830 }
1831 }
1832 } else {
1833 for (unsigned i = 0; i < count; i++) {
1834 p->slot[i] = NULL;
1835 pipe_sampler_view_reference(&p->slot[i], views[i]);
1836
1837 if (views[i]) {
1838 if (views[i]->target == PIPE_BUFFER)
1839 tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next,
1840 views[i]->texture);
1841 else
1842 tc_set_resource_batch_usage(tc, views[i]->texture);
1843 } else {
1844 tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]);
1845 }
1846 }
1847 }
1848
1849 tc_unbind_buffers(&tc->sampler_buffers[shader][start + count],
1850 unbind_num_trailing_slots);
1851 tc->seen_sampler_buffers[shader] = true;
1852 } else {
1853 p->count = 0;
1854 p->unbind_num_trailing_slots = count + unbind_num_trailing_slots;
1855
1856 tc_unbind_buffers(&tc->sampler_buffers[shader][start],
1857 count + unbind_num_trailing_slots);
1858 }
1859 }
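/* Ownership note for the path above (descriptive only): with take_ownership
 * the caller's view references are moved into the call payload by memcpy and
 * no extra references are taken, i.e. the references are consumed; without
 * it, tc takes its own reference on each view and the caller keeps its own.
 * Either way, buffer-backed views are added to the current buffer list and
 * texture-backed views only get their batch usage updated.
 */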
1860
1861 struct tc_shader_images {
1862 struct tc_call_base base;
1863 uint8_t shader, start, count;
1864 uint8_t unbind_num_trailing_slots;
1865 struct pipe_image_view slot[0]; /* more will be allocated if needed */
1866 };
1867
1868 static uint16_t ALWAYS_INLINE
1869 tc_call_set_shader_images(struct pipe_context *pipe, void *call)
1870 {
1871 struct tc_shader_images *p = (struct tc_shader_images *)call;
1872 unsigned count = p->count;
1873
1874 if (!p->count) {
1875 pipe->set_shader_images(pipe, p->shader, p->start, 0,
1876 p->unbind_num_trailing_slots, NULL);
1877 return call_size(tc_shader_images);
1878 }
1879
1880 pipe->set_shader_images(pipe, p->shader, p->start, p->count,
1881 p->unbind_num_trailing_slots, p->slot);
1882
1883 for (unsigned i = 0; i < count; i++)
1884 tc_drop_resource_reference(p->slot[i].resource);
1885
1886 return p->base.num_slots;
1887 }
1888
1889 static void
1890 tc_set_shader_images(struct pipe_context *_pipe,
1891 enum pipe_shader_type shader,
1892 unsigned start, unsigned count,
1893 unsigned unbind_num_trailing_slots,
1894 const struct pipe_image_view *images)
1895 {
1896 if (!count && !unbind_num_trailing_slots)
1897 return;
1898
1899 struct threaded_context *tc = threaded_context(_pipe);
1900 struct tc_shader_images *p =
1901 tc_add_slot_based_call(tc, TC_CALL_set_shader_images, tc_shader_images,
1902 images ? count : 0);
1903 unsigned writable_buffers = 0;
1904
1905 p->shader = shader;
1906 p->start = start;
1907
1908 if (images) {
1909 p->count = count;
1910 p->unbind_num_trailing_slots = unbind_num_trailing_slots;
1911
1912 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
1913
1914 for (unsigned i = 0; i < count; i++) {
1915 struct pipe_resource *resource = images[i].resource;
1916
1917 tc_set_resource_reference(&p->slot[i].resource, resource);
1918
1919 if (resource) {
1920 if (resource->target == PIPE_BUFFER) {
1921 tc_bind_buffer(&tc->image_buffers[shader][start + i], next, resource);
1922
1923 if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) {
1924 struct threaded_resource *tres = threaded_resource(resource);
1925
1926 tc_buffer_disable_cpu_storage(resource);
1927 util_range_add(&tres->b, &tres->valid_buffer_range,
1928 images[i].u.buf.offset,
1929 images[i].u.buf.offset + images[i].u.buf.size);
1930 writable_buffers |= BITFIELD_BIT(start + i);
1931 }
1932 } else {
1933 tc_set_resource_batch_usage(tc, resource);
1934 }
1935 } else {
1936 tc_unbind_buffer(&tc->image_buffers[shader][start + i]);
1937 }
1938 }
1939 memcpy(p->slot, images, count * sizeof(images[0]));
1940
1941 tc_unbind_buffers(&tc->image_buffers[shader][start + count],
1942 unbind_num_trailing_slots);
1943 tc->seen_image_buffers[shader] = true;
1944 } else {
1945 p->count = 0;
1946 p->unbind_num_trailing_slots = count + unbind_num_trailing_slots;
1947
1948 tc_unbind_buffers(&tc->image_buffers[shader][start],
1949 count + unbind_num_trailing_slots);
1950 }
1951
1952 tc->image_buffers_writeable_mask[shader] &= ~BITFIELD_RANGE(start, count);
1953 tc->image_buffers_writeable_mask[shader] |= writable_buffers;
1954 }
1955
1956 struct tc_shader_buffers {
1957 struct tc_call_base base;
1958 uint8_t shader, start, count;
1959 bool unbind;
1960 unsigned writable_bitmask;
1961 struct pipe_shader_buffer slot[0]; /* more will be allocated if needed */
1962 };
1963
1964 static uint16_t ALWAYS_INLINE
1965 tc_call_set_shader_buffers(struct pipe_context *pipe, void *call)
1966 {
1967 struct tc_shader_buffers *p = (struct tc_shader_buffers *)call;
1968 unsigned count = p->count;
1969
1970 if (p->unbind) {
1971 pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, NULL, 0);
1972 return call_size(tc_shader_buffers);
1973 }
1974
1975 pipe->set_shader_buffers(pipe, p->shader, p->start, p->count, p->slot,
1976 p->writable_bitmask);
1977
1978 for (unsigned i = 0; i < count; i++)
1979 tc_drop_resource_reference(p->slot[i].buffer);
1980
1981 return p->base.num_slots;
1982 }
1983
1984 static void
1985 tc_set_shader_buffers(struct pipe_context *_pipe,
1986 enum pipe_shader_type shader,
1987 unsigned start, unsigned count,
1988 const struct pipe_shader_buffer *buffers,
1989 unsigned writable_bitmask)
1990 {
1991 if (!count)
1992 return;
1993
1994 struct threaded_context *tc = threaded_context(_pipe);
1995 struct tc_shader_buffers *p =
1996 tc_add_slot_based_call(tc, TC_CALL_set_shader_buffers, tc_shader_buffers,
1997 buffers ? count : 0);
1998
1999 p->shader = shader;
2000 p->start = start;
2001 p->count = count;
2002 p->unbind = buffers == NULL;
2003 p->writable_bitmask = writable_bitmask;
2004
2005 if (buffers) {
2006 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2007
2008 for (unsigned i = 0; i < count; i++) {
2009 struct pipe_shader_buffer *dst = &p->slot[i];
2010 const struct pipe_shader_buffer *src = buffers + i;
2011
2012 tc_set_resource_reference(&dst->buffer, src->buffer);
2013 dst->buffer_offset = src->buffer_offset;
2014 dst->buffer_size = src->buffer_size;
2015
2016 if (src->buffer) {
2017 struct threaded_resource *tres = threaded_resource(src->buffer);
2018
2019 tc_bind_buffer(&tc->shader_buffers[shader][start + i], next, &tres->b);
2020
2021 if (writable_bitmask & BITFIELD_BIT(i)) {
2022 tc_buffer_disable_cpu_storage(src->buffer);
2023 util_range_add(&tres->b, &tres->valid_buffer_range,
2024 src->buffer_offset,
2025 src->buffer_offset + src->buffer_size);
2026 }
2027 } else {
2028 tc_unbind_buffer(&tc->shader_buffers[shader][start + i]);
2029 }
2030 }
2031 tc->seen_shader_buffers[shader] = true;
2032 } else {
2033 tc_unbind_buffers(&tc->shader_buffers[shader][start], count);
2034 }
2035
2036 tc->shader_buffers_writeable_mask[shader] &= ~BITFIELD_RANGE(start, count);
2037 tc->shader_buffers_writeable_mask[shader] |= writable_bitmask << start;
2038 }
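/* Mask bookkeeping example for the code above (illustrative): the incoming
 * writable_bitmask is relative to 'start', so with start = 2, count = 2 and
 * writable_bitmask = 0x1, the call clears bits 2..3 of
 * shader_buffers_writeable_mask and then sets bit 2 (0x1 << 2), marking only
 * SSBO slot 2 as GPU-writable.
 */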
2039
2040 static uint16_t ALWAYS_INLINE
2041 tc_call_set_vertex_buffers(struct pipe_context *pipe, void *call)
2042 {
2043 struct tc_vertex_buffers *p = (struct tc_vertex_buffers *)call;
2044 unsigned count = p->count;
2045
2046 for (unsigned i = 0; i < count; i++)
2047 tc_assert(!p->slot[i].is_user_buffer);
2048
2049 pipe->set_vertex_buffers(pipe, count, p->slot);
2050 return p->base.num_slots;
2051 }
2052
2053 static void
2054 tc_set_vertex_buffers(struct pipe_context *_pipe, unsigned count,
2055 const struct pipe_vertex_buffer *buffers)
2056 {
2057 struct threaded_context *tc = threaded_context(_pipe);
2058
2059 assert(!count || buffers);
2060
2061 if (count) {
2062 struct tc_vertex_buffers *p =
2063 tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, count);
2064 p->count = count;
2065
2066 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2067
2068 memcpy(p->slot, buffers, count * sizeof(struct pipe_vertex_buffer));
2069
2070 for (unsigned i = 0; i < count; i++) {
2071 struct pipe_resource *buf = buffers[i].buffer.resource;
2072
2073 if (buf) {
2074 tc_bind_buffer(&tc->vertex_buffers[i], next, buf);
2075 } else {
2076 tc_unbind_buffer(&tc->vertex_buffers[i]);
2077 }
2078 }
2079 } else {
2080 struct tc_vertex_buffers *p =
2081 tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, 0);
2082 p->count = 0;
2083 }
2084
2085 /* We don't need to unbind trailing buffers because we never touch bindings
2086 * after num_vertex_buffers.
2087 */
2088 tc->num_vertex_buffers = count;
2089 }
2090
2091 struct pipe_vertex_buffer *
2092 tc_add_set_vertex_buffers_call(struct pipe_context *_pipe, unsigned count)
2093 {
2094 struct threaded_context *tc = threaded_context(_pipe);
2095
2096 /* We don't need to unbind trailing buffers because we never touch bindings
2097 * after num_vertex_buffers.
2098 */
2099 tc->num_vertex_buffers = count;
2100
2101 struct tc_vertex_buffers *p =
2102 tc_add_slot_based_call(tc, TC_CALL_set_vertex_buffers, tc_vertex_buffers, count);
2103 p->count = count;
2104 return p->slot;
2105 }
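/* Example use of tc_add_set_vertex_buffers_call() (a sketch; "my_buffer" is a
 * hypothetical pipe_resource the caller wants to bind): the caller fills the
 * returned, uninitialized slots in place instead of building a temporary
 * pipe_vertex_buffer array:
 *
 *    struct pipe_vertex_buffer *vb = tc_add_set_vertex_buffers_call(pipe, 1);
 *    vb[0].is_user_buffer = false;
 *    vb[0].buffer_offset = 0;
 *    vb[0].buffer.resource = NULL;
 *    pipe_resource_reference(&vb[0].buffer.resource, my_buffer);
 *
 * Note that, unlike tc_set_vertex_buffers() above, this helper only records
 * the call and the new vertex buffer count; it does not call tc_bind_buffer()
 * on the buffers written into the slots.
 */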
2106
2107 struct tc_stream_outputs {
2108 struct tc_call_base base;
2109 uint8_t count;
2110 uint8_t output_prim;
2111 struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
2112 unsigned offsets[PIPE_MAX_SO_BUFFERS];
2113 };
2114
2115 static uint16_t ALWAYS_INLINE
2116 tc_call_set_stream_output_targets(struct pipe_context *pipe, void *call)
2117 {
2118 struct tc_stream_outputs *p = to_call(call, tc_stream_outputs);
2119 unsigned count = p->count;
2120
2121 pipe->set_stream_output_targets(pipe, count, p->targets, p->offsets,
2122 p->output_prim);
2123 for (unsigned i = 0; i < count; i++)
2124 tc_drop_so_target_reference(p->targets[i]);
2125
2126 return call_size(tc_stream_outputs);
2127 }
2128
2129 static void
2130 tc_set_stream_output_targets(struct pipe_context *_pipe,
2131 unsigned count,
2132 struct pipe_stream_output_target **tgs,
2133 const unsigned *offsets,
2134 enum mesa_prim output_prim)
2135 {
2136 struct threaded_context *tc = threaded_context(_pipe);
2137 struct tc_stream_outputs *p =
2138 tc_add_call(tc, TC_CALL_set_stream_output_targets, tc_stream_outputs);
2139 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
2140
2141 for (unsigned i = 0; i < count; i++) {
2142 p->targets[i] = NULL;
2143 pipe_so_target_reference(&p->targets[i], tgs[i]);
2144 if (tgs[i]) {
2145 tc_buffer_disable_cpu_storage(tgs[i]->buffer);
2146 tc_bind_buffer(&tc->streamout_buffers[i], next, tgs[i]->buffer);
2147 } else {
2148 tc_unbind_buffer(&tc->streamout_buffers[i]);
2149 }
2150 }
2151 p->count = count;
2152 p->output_prim = output_prim;
2153 memcpy(p->offsets, offsets, count * sizeof(unsigned));
2154
2155 tc_unbind_buffers(&tc->streamout_buffers[count], PIPE_MAX_SO_BUFFERS - count);
2156 if (count)
2157 tc->seen_streamout_buffers = true;
2158 }
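/* Note for the path above (descriptive only): stream-output targets are GPU
 * write destinations, so their backing buffers get CPU storage disabled and
 * are added to the current buffer list used for busy tracking, while unused
 * trailing slots are explicitly unbound.
 */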
2159
2160 static void
2161 tc_set_compute_resources(struct pipe_context *_pipe, unsigned start,
2162 unsigned count, struct pipe_surface **resources)
2163 {
2164 struct threaded_context *tc = threaded_context(_pipe);
2165 struct pipe_context *pipe = tc->pipe;
2166
2167 tc_sync(tc);
2168 pipe->set_compute_resources(pipe, start, count, resources);
2169 }
2170
2171 static void
2172 tc_set_global_binding(struct pipe_context *_pipe, unsigned first,
2173 unsigned count, struct pipe_resource **resources,
2174 uint32_t **handles)
2175 {
2176 struct threaded_context *tc = threaded_context(_pipe);
2177 struct pipe_context *pipe = tc->pipe;
2178
2179 tc_sync(tc);
2180 pipe->set_global_binding(pipe, first, count, resources, handles);
2181 }
2182
2183
2184 /********************************************************************
2185 * views
2186 */
2187
2188 static struct pipe_surface *
2189 tc_create_surface(struct pipe_context *_pipe,
2190 struct pipe_resource *resource,
2191 const struct pipe_surface *surf_tmpl)
2192 {
2193 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2194 struct pipe_surface *view =
2195 pipe->create_surface(pipe, resource, surf_tmpl);
2196
2197 if (view)
2198 view->context = _pipe;
2199 return view;
2200 }
2201
2202 static void
2203 tc_surface_destroy(struct pipe_context *_pipe,
2204 struct pipe_surface *surf)
2205 {
2206 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2207
2208 pipe->surface_destroy(pipe, surf);
2209 }
2210
2211 static struct pipe_sampler_view *
2212 tc_create_sampler_view(struct pipe_context *_pipe,
2213 struct pipe_resource *resource,
2214 const struct pipe_sampler_view *templ)
2215 {
2216 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2217 struct pipe_sampler_view *view =
2218 pipe->create_sampler_view(pipe, resource, templ);
2219
2220 if (view)
2221 view->context = _pipe;
2222 return view;
2223 }
2224
2225 static void
2226 tc_sampler_view_destroy(struct pipe_context *_pipe,
2227 struct pipe_sampler_view *view)
2228 {
2229 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2230
2231 pipe->sampler_view_destroy(pipe, view);
2232 }
2233
2234 static struct pipe_stream_output_target *
2235 tc_create_stream_output_target(struct pipe_context *_pipe,
2236 struct pipe_resource *res,
2237 unsigned buffer_offset,
2238 unsigned buffer_size)
2239 {
2240 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2241 struct threaded_resource *tres = threaded_resource(res);
2242 struct pipe_stream_output_target *view;
2243
2244 util_range_add(&tres->b, &tres->valid_buffer_range, buffer_offset,
2245 buffer_offset + buffer_size);
2246
2247 view = pipe->create_stream_output_target(pipe, res, buffer_offset,
2248 buffer_size);
2249 if (view)
2250 view->context = _pipe;
2251 return view;
2252 }
2253
2254 static void
2255 tc_stream_output_target_destroy(struct pipe_context *_pipe,
2256 struct pipe_stream_output_target *target)
2257 {
2258 struct pipe_context *pipe = threaded_context(_pipe)->pipe;
2259
2260 pipe->stream_output_target_destroy(pipe, target);
2261 }
2262
2263
2264 /********************************************************************
2265 * bindless
2266 */
2267
2268 static uint64_t
2269 tc_create_texture_handle(struct pipe_context *_pipe,
2270 struct pipe_sampler_view *view,
2271 const struct pipe_sampler_state *state)
2272 {
2273 struct threaded_context *tc = threaded_context(_pipe);
2274 struct pipe_context *pipe = tc->pipe;
2275
2276 tc_sync(tc);
2277 return pipe->create_texture_handle(pipe, view, state);
2278 }
2279
2280 struct tc_make_texture_handle_resident {
2281 struct tc_call_base base;
2282 bool resident;
2283 uint64_t handle;
2284 };
2285
2286 static uint16_t ALWAYS_INLINE
2287 tc_call_make_texture_handle_resident(struct pipe_context *pipe, void *call)
2288 {
2289 struct tc_make_texture_handle_resident *p =
2290 to_call(call, tc_make_texture_handle_resident);
2291
2292 pipe->make_texture_handle_resident(pipe, p->handle, p->resident);
2293 return call_size(tc_make_texture_handle_resident);
2294 }
2295
2296 static void
2297 tc_make_texture_handle_resident(struct pipe_context *_pipe, uint64_t handle,
2298 bool resident)
2299 {
2300 struct threaded_context *tc = threaded_context(_pipe);
2301 struct tc_make_texture_handle_resident *p =
2302 tc_add_call(tc, TC_CALL_make_texture_handle_resident,
2303 tc_make_texture_handle_resident);
2304
2305 p->handle = handle;
2306 p->resident = resident;
2307 }
2308
2309 static uint64_t
2310 tc_create_image_handle(struct pipe_context *_pipe,
2311 const struct pipe_image_view *image)
2312 {
2313 struct threaded_context *tc = threaded_context(_pipe);
2314 struct pipe_context *pipe = tc->pipe;
2315 struct pipe_resource *resource = image->resource;
2316
2317 if (image->access & PIPE_IMAGE_ACCESS_WRITE &&
2318 resource && resource->target == PIPE_BUFFER) {
2319 struct threaded_resource *tres = threaded_resource(resource);
2320
2321 /* The CPU storage doesn't support writable buffers. */
2322 tc_buffer_disable_cpu_storage(resource);
2323
2324 util_range_add(&tres->b, &tres->valid_buffer_range,
2325 image->u.buf.offset,
2326 image->u.buf.offset + image->u.buf.size);
2327 }
2328
2329 tc_sync(tc);
2330 return pipe->create_image_handle(pipe, image);
2331 }
2332
2333 struct tc_make_image_handle_resident {
2334 struct tc_call_base base;
2335 bool resident;
2336 unsigned access;
2337 uint64_t handle;
2338 };
2339
2340 static uint16_t ALWAYS_INLINE
2341 tc_call_make_image_handle_resident(struct pipe_context *pipe, void *call)
2342 {
2343 struct tc_make_image_handle_resident *p =
2344 to_call(call, tc_make_image_handle_resident);
2345
2346 pipe->make_image_handle_resident(pipe, p->handle, p->access, p->resident);
2347 return call_size(tc_make_image_handle_resident);
2348 }
2349
2350 static void
2351 tc_make_image_handle_resident(struct pipe_context *_pipe, uint64_t handle,
2352 unsigned access, bool resident)
2353 {
2354 struct threaded_context *tc = threaded_context(_pipe);
2355 struct tc_make_image_handle_resident *p =
2356 tc_add_call(tc, TC_CALL_make_image_handle_resident,
2357 tc_make_image_handle_resident);
2358
2359 p->handle = handle;
2360 p->access = access;
2361 p->resident = resident;
2362 }
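/* Note on the bindless entry points above (descriptive only): handle creation
 * has to return a value to the caller, so it synchronizes with the driver
 * thread, while make_*_handle_resident has no return value and is enqueued
 * like any other state change.
 */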
2363
2364
2365 /********************************************************************
2366 * transfer
2367 */
2368
2369 static void
2370 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
2371 unsigned flags);
2372
2373 struct tc_replace_buffer_storage {
2374 struct tc_call_base base;
2375 uint16_t num_rebinds;
2376 uint32_t rebind_mask;
2377 uint32_t delete_buffer_id;
2378 struct pipe_resource *dst;
2379 struct pipe_resource *src;
2380 tc_replace_buffer_storage_func func;
2381 };
2382
2383 static uint16_t ALWAYS_INLINE
2384 tc_call_replace_buffer_storage(struct pipe_context *pipe, void *call)
2385 {
2386 struct tc_replace_buffer_storage *p = to_call(call, tc_replace_buffer_storage);
2387
2388 p->func(pipe, p->dst, p->src, p->num_rebinds, p->rebind_mask, p->delete_buffer_id);
2389
2390 tc_drop_resource_reference(p->dst);
2391 tc_drop_resource_reference(p->src);
2392 return call_size(tc_replace_buffer_storage);
2393 }
2394
2395 /* Return true if the buffer has been invalidated or is idle. */
2396 static bool
2397 tc_invalidate_buffer(struct threaded_context *tc,
2398 struct threaded_resource *tbuf)
2399 {
2400 if (!tc_is_buffer_busy(tc, tbuf, PIPE_MAP_READ_WRITE)) {
2401 /* The buffer is idle, so reallocating its storage would be a no-op. We
2402 * still clear the valid range, because this is technically an
2403 * invalidation; we're just skipping the useless reallocation.
2404 *
2405 * If the buffer is bound for write, we can't clear the valid range.
2406 */
2407 if (!tc_is_buffer_bound_for_write(tc, tbuf->buffer_id_unique))
2408 util_range_set_empty(&tbuf->valid_buffer_range);
2409 return true;
2410 }
2411
2412 struct pipe_screen *screen = tc->base.screen;
2413 struct pipe_resource *new_buf;
2414
2415 /* Shared, pinned, and sparse buffers can't be reallocated. */
2416 if (tbuf->is_shared ||
2417 tbuf->is_user_ptr ||
2418 tbuf->b.flags & (PIPE_RESOURCE_FLAG_SPARSE | PIPE_RESOURCE_FLAG_UNMAPPABLE))
2419 return false;
2420
2421 assert(tbuf->b.target == PIPE_BUFFER);
2422 tc->bytes_replaced_estimate += tbuf->b.width0;
2423
2424 if (tc->bytes_replaced_limit && (tc->bytes_replaced_estimate > tc->bytes_replaced_limit)) {
2425 tc_flush(&tc->base, NULL, PIPE_FLUSH_ASYNC);
2426 }
2427
2428 /* Allocate a new one. */
2429 new_buf = screen->resource_create(screen, &tbuf->b);
2430 if (!new_buf)
2431 return false;
2432
2433 /* Replace the "latest" pointer. */
2434 if (tbuf->latest != &tbuf->b)
2435 pipe_resource_reference(&tbuf->latest, NULL);
2436
2437 tbuf->latest = new_buf;
2438
2439 uint32_t delete_buffer_id = tbuf->buffer_id_unique;
2440
2441 /* Enqueue storage replacement of the original buffer. */
2442 struct tc_replace_buffer_storage *p =
2443 tc_add_call(tc, TC_CALL_replace_buffer_storage,
2444 tc_replace_buffer_storage);
2445
2446 p->func = tc->replace_buffer_storage;
2447 tc_set_resource_reference(&p->dst, &tbuf->b);
2448 tc_set_resource_reference(&p->src, new_buf);
2449 p->delete_buffer_id = delete_buffer_id;
2450 p->rebind_mask = 0;
2451
2452 /* Treat the current buffer as the new buffer. */
2453 bool bound_for_write = tc_is_buffer_bound_for_write(tc, tbuf->buffer_id_unique);
2454 p->num_rebinds = tc_rebind_buffer(tc, tbuf->buffer_id_unique,
2455 threaded_resource(new_buf)->buffer_id_unique,
2456 &p->rebind_mask);
2457
2458 /* If the buffer is not bound for write, clear the valid range. */
2459 if (!bound_for_write)
2460 util_range_set_empty(&tbuf->valid_buffer_range);
2461
2462 tbuf->buffer_id_unique = threaded_resource(new_buf)->buffer_id_unique;
2463 threaded_resource(new_buf)->buffer_id_unique = 0;
2464
2465 return true;
2466 }
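/* The tc_replace_buffer_storage_func enqueued above is supplied by the driver
 * when it creates the threaded context. A minimal sketch of what such a
 * callback might look like (illustrative only; the "my_*" names are
 * hypothetical driver helpers, not part of this file):
 *
 *    static void
 *    my_replace_buffer_storage(struct pipe_context *ctx,
 *                              struct pipe_resource *dst,
 *                              struct pipe_resource *src,
 *                              unsigned num_rebinds, uint32_t rebind_mask,
 *                              uint32_t delete_buffer_id)
 *    {
 *       // Point "dst" at the freshly allocated storage of "src" and rebind
 *       // it wherever the driver still references the old storage.
 *       my_swap_storage(ctx, dst, src);
 *       my_rebind_buffer(ctx, dst, num_rebinds, rebind_mask);
 *       my_release_buffer_id(ctx, delete_buffer_id);
 *    }
 */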
2467
2468 static unsigned
2469 tc_improve_map_buffer_flags(struct threaded_context *tc,
2470 struct threaded_resource *tres, unsigned usage,
2471 unsigned offset, unsigned size)
2472 {
2473 /* Never invalidate inside the driver and never infer "unsynchronized". */
2474 unsigned tc_flags = TC_TRANSFER_MAP_NO_INVALIDATE |
2475 TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED;
2476
2477 /* Prevent a reentry. */
2478 if (usage & tc_flags)
2479 return usage;
2480
2481 /* Use the staging upload if it's preferred. */
2482 if (usage & (PIPE_MAP_DISCARD_RANGE |
2483 PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
2484 !(usage & PIPE_MAP_PERSISTENT) &&
2485 tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY &&
2486 tc->use_forced_staging_uploads) {
2487 usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE |
2488 PIPE_MAP_UNSYNCHRONIZED);
2489
2490 return usage | tc_flags | PIPE_MAP_DISCARD_RANGE;
2491 }
2492
2493 /* Sparse buffers can't be mapped directly and can't be reallocated
2494 * (fully invalidated). That may just be a radeonsi limitation, but
2495 * the threaded context must obey it with radeonsi.
2496 */
2497 if (tres->b.flags & (PIPE_RESOURCE_FLAG_SPARSE | PIPE_RESOURCE_FLAG_UNMAPPABLE)) {
2498 /* We can use DISCARD_RANGE instead of full discard. This is the only
2499 * fast path for sparse buffers that doesn't need thread synchronization.
2500 */
2501 if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE)
2502 usage |= PIPE_MAP_DISCARD_RANGE;
2503
2504 /* Allow DISCARD_WHOLE_RESOURCE and inferring UNSYNCHRONIZED in drivers.
2505 * The threaded context doesn't do unsynchronized mappings and
2506 * invalidations of sparse buffers, so correct driver behavior won't
2507 * result in incorrect behavior with the threaded context.
2508 */
2509 return usage;
2510 }
2511
2512 usage |= tc_flags;
2513
2514 /* Handle CPU reads trivially. */
2515 if (usage & PIPE_MAP_READ) {
2516 if (usage & PIPE_MAP_UNSYNCHRONIZED)
2517 usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* don't sync */
2518
2519 /* Drivers aren't allowed to do buffer invalidations. */
2520 return usage & ~PIPE_MAP_DISCARD_WHOLE_RESOURCE;
2521 }
2522
2523 /* See if the buffer range being mapped has never been initialized or
2524 * the buffer is idle, in which case it can be mapped unsynchronized. */
2525 if (!(usage & PIPE_MAP_UNSYNCHRONIZED) &&
2526 ((!tres->is_shared &&
2527 !util_ranges_intersect(&tres->valid_buffer_range, offset, offset + size)) ||
2528 !tc_is_buffer_busy(tc, tres, usage)))
2529 usage |= PIPE_MAP_UNSYNCHRONIZED;
2530
2531 if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
2532 /* If discarding the entire valid range, discard the whole resource instead. */
2533 if (usage & PIPE_MAP_DISCARD_RANGE &&
2534 util_ranges_covered(&tres->valid_buffer_range, offset, offset + size))
2535 usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
2536
2537 /* Discard the whole resource if needed. */
2538 if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) {
2539 if (tc_invalidate_buffer(tc, tres))
2540 usage |= PIPE_MAP_UNSYNCHRONIZED;
2541 else
2542 usage |= PIPE_MAP_DISCARD_RANGE; /* fallback */
2543 }
2544 }
2545
2546 /* We won't need this flag anymore. */
2547 /* TODO: We might not need TC_TRANSFER_MAP_NO_INVALIDATE with this. */
2548 usage &= ~PIPE_MAP_DISCARD_WHOLE_RESOURCE;
2549
2550 /* GL_AMD_pinned_memory and persistent mappings can't use staging
2551 * buffers. */
2552 if (usage & (PIPE_MAP_UNSYNCHRONIZED |
2553 PIPE_MAP_PERSISTENT) ||
2554 tres->is_user_ptr)
2555 usage &= ~PIPE_MAP_DISCARD_RANGE;
2556
2557 /* Unsynchronized buffer mappings don't have to synchronize the thread. */
2558 if (usage & PIPE_MAP_UNSYNCHRONIZED) {
2559 usage &= ~PIPE_MAP_DISCARD_RANGE;
2560 usage |= TC_TRANSFER_MAP_THREADED_UNSYNC; /* notify the driver */
2561 }
2562
2563 return usage;
2564 }
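/* Worked example for the logic above (illustrative): a synchronized write
 * with PIPE_MAP_DISCARD_RANGE whose range covers the whole valid region of a
 * busy, non-sparse buffer is promoted to DISCARD_WHOLE_RESOURCE; if
 * tc_invalidate_buffer() then manages to reallocate the storage, the request
 * is further turned into an unsynchronized mapping of the new, idle buffer
 * (PIPE_MAP_UNSYNCHRONIZED | TC_TRANSFER_MAP_THREADED_UNSYNC), so no thread
 * synchronization is needed at all.
 */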
2565
2566 static void *
2567 tc_buffer_map(struct pipe_context *_pipe,
2568 struct pipe_resource *resource, unsigned level,
2569 unsigned usage, const struct pipe_box *box,
2570 struct pipe_transfer **transfer)
2571 {
2572 struct threaded_context *tc = threaded_context(_pipe);
2573 struct threaded_resource *tres = threaded_resource(resource);
2574 struct pipe_context *pipe = tc->pipe;
2575
2576 /* PIPE_MAP_THREAD_SAFE is used by glthread, which shouldn't use the CPU storage.
2577 * Disabling it here shouldn't normally be necessary because glthread only uses large buffers.
2578 */
2579 if (usage & PIPE_MAP_THREAD_SAFE)
2580 tc_buffer_disable_cpu_storage(resource);
2581
2582 usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width);
2583
2584 /* If the CPU storage is enabled, return it directly. */
2585 if (tres->allow_cpu_storage && !(usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
2586 /* We can't let resource_copy_region disable the CPU storage. */
2587 assert(!(tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY));
2588
2589 if (!tres->cpu_storage) {
2590 tres->cpu_storage = align_malloc(resource->width0, tc->map_buffer_alignment);
2591
2592 if (tres->cpu_storage && tres->valid_buffer_range.end) {
2593 /* The GPU buffer contains valid data. Copy them to the CPU storage. */
2594 struct pipe_box box2;
2595 struct pipe_transfer *transfer2;
2596
2597 unsigned valid_range_len = tres->valid_buffer_range.end - tres->valid_buffer_range.start;
2598 u_box_1d(tres->valid_buffer_range.start, valid_range_len, &box2);
2599
2600 tc_sync_msg(tc, "cpu storage GPU -> CPU copy");
2601 tc_set_driver_thread(tc);
2602
2603 void *ret = pipe->buffer_map(pipe, tres->latest ? tres->latest : resource,
2604 0, PIPE_MAP_READ, &box2, &transfer2);
2605 memcpy(&((uint8_t*)tres->cpu_storage)[tres->valid_buffer_range.start],
2606 ret,
2607 valid_range_len);
2608 pipe->buffer_unmap(pipe, transfer2);
2609
2610 tc_clear_driver_thread(tc);
2611 }
2612 }
2613
2614 if (tres->cpu_storage) {
2615 struct threaded_transfer *ttrans = slab_zalloc(&tc->pool_transfers);
2616 ttrans->b.resource = resource;
2617 ttrans->b.usage = usage;
2618 ttrans->b.box = *box;
2619 ttrans->valid_buffer_range = &tres->valid_buffer_range;
2620 ttrans->cpu_storage_mapped = true;
2621 *transfer = &ttrans->b;
2622
2623 return (uint8_t*)tres->cpu_storage + box->x;
2624 } else {
2625 tres->allow_cpu_storage = false;
2626 }
2627 }
2628
2629 /* Do a staging transfer within the threaded context. The driver should
2630 * only get resource_copy_region.
2631 */
2632 if (usage & PIPE_MAP_DISCARD_RANGE) {
2633 struct threaded_transfer *ttrans = slab_zalloc(&tc->pool_transfers);
2634 uint8_t *map;
2635
2636 u_upload_alloc(tc->base.stream_uploader, 0,
2637 box->width + (box->x % tc->map_buffer_alignment),
2638 tc->map_buffer_alignment, &ttrans->b.offset,
2639 &ttrans->staging, (void**)&map);
2640 if (!map) {
2641 slab_free(&tc->pool_transfers, ttrans);
2642 return NULL;
2643 }
2644
2645 ttrans->b.resource = resource;
2646 ttrans->b.level = 0;
2647 ttrans->b.usage = usage;
2648 ttrans->b.box = *box;
2649 ttrans->b.stride = 0;
2650 ttrans->b.layer_stride = 0;
2651 ttrans->valid_buffer_range = &tres->valid_buffer_range;
2652 ttrans->cpu_storage_mapped = false;
2653 *transfer = &ttrans->b;
2654
2655 p_atomic_inc(&tres->pending_staging_uploads);
2656 util_range_add(resource, &tres->pending_staging_uploads_range,
2657 box->x, box->x + box->width);
2658
2659 return map + (box->x % tc->map_buffer_alignment);
2660 }
2661
2662 if (usage & PIPE_MAP_UNSYNCHRONIZED &&
2663 p_atomic_read(&tres->pending_staging_uploads) &&
2664 util_ranges_intersect(&tres->pending_staging_uploads_range, box->x, box->x + box->width)) {
2665 /* Write conflict detected between a staging transfer and the direct mapping we're
2666 * going to do. Resolve the conflict by ignoring UNSYNCHRONIZED so the direct mapping
2667 * will have to wait for the staging transfer completion.
2668 * Note: The conflict detection is only based on the mapped range, not on the actual
2669 * written range(s).
2670 */
2671 usage &= ~PIPE_MAP_UNSYNCHRONIZED & ~TC_TRANSFER_MAP_THREADED_UNSYNC;
2672 tc->use_forced_staging_uploads = false;
2673 }
2674
2675 /* Unsynchronized buffer mappings don't have to synchronize the thread. */
2676 if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)) {
2677 tc_sync_msg(tc, usage & PIPE_MAP_DISCARD_RANGE ? " discard_range" :
2678 usage & PIPE_MAP_READ ? " read" : " staging conflict");
2679 tc_set_driver_thread(tc);
2680 }
2681
2682 tc->bytes_mapped_estimate += box->width;
2683
2684 void *ret = pipe->buffer_map(pipe, tres->latest ? tres->latest : resource,
2685 level, usage, box, transfer);
2686 threaded_transfer(*transfer)->valid_buffer_range = &tres->valid_buffer_range;
2687 threaded_transfer(*transfer)->cpu_storage_mapped = false;
2688
2689 if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
2690 tc_clear_driver_thread(tc);
2691
2692 return ret;
2693 }
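/* Summary of the mapping paths above (descriptive only): a buffer map is
 * served either from the per-resource CPU storage (filled from the GPU copy
 * once if the buffer already had valid contents), from a temporary staging
 * allocation when DISCARD_RANGE applies (copied back later through
 * resource_copy_region), or by forwarding buffer_map to the driver, which
 * only synchronizes with the driver thread when the mapping is not
 * unsynchronized.
 */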
2694
2695 static void *
2696 tc_texture_map(struct pipe_context *_pipe,
2697 struct pipe_resource *resource, unsigned level,
2698 unsigned usage, const struct pipe_box *box,
2699 struct pipe_transfer **transfer)
2700 {
2701 struct threaded_context *tc = threaded_context(_pipe);
2702 struct threaded_resource *tres = threaded_resource(resource);
2703 struct pipe_context *pipe = tc->pipe;
2704
2705 tc_sync_msg(tc, "texture");
2706 tc_set_driver_thread(tc);
2707 /* block all unsync texture subdata during map */
2708 tc_set_resource_batch_usage_persistent(tc, resource, true);
2709
2710 tc->bytes_mapped_estimate += box->width;
2711
2712 void *ret = pipe->texture_map(pipe, tres->latest ? tres->latest : resource,
2713 level, usage, box, transfer);
2714
2715 if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
2716 tc_clear_driver_thread(tc);
2717
2718 return ret;
2719 }
2720
2721 struct tc_transfer_flush_region {
2722 struct tc_call_base base;
2723 struct pipe_box box;
2724 struct pipe_transfer *transfer;
2725 };
2726
2727 static uint16_t ALWAYS_INLINE
2728 tc_call_transfer_flush_region(struct pipe_context *pipe, void *call)
2729 {
2730 struct tc_transfer_flush_region *p = to_call(call, tc_transfer_flush_region);
2731
2732 pipe->transfer_flush_region(pipe, p->transfer, &p->box);
2733 return call_size(tc_transfer_flush_region);
2734 }
2735
2736 struct tc_resource_copy_region {
2737 struct tc_call_base base;
2738 unsigned dst_level;
2739 unsigned dstx, dsty, dstz;
2740 unsigned src_level;
2741 struct pipe_box src_box;
2742 struct pipe_resource *dst;
2743 struct pipe_resource *src;
2744 };
2745
2746 static void
2747 tc_resource_copy_region(struct pipe_context *_pipe,
2748 struct pipe_resource *dst, unsigned dst_level,
2749 unsigned dstx, unsigned dsty, unsigned dstz,
2750 struct pipe_resource *src, unsigned src_level,
2751 const struct pipe_box *src_box);
2752
2753 static void
2754 tc_buffer_do_flush_region(struct threaded_context *tc,
2755 struct threaded_transfer *ttrans,
2756 const struct pipe_box *box)
2757 {
2758 struct threaded_resource *tres = threaded_resource(ttrans->b.resource);
2759
2760 if (ttrans->staging) {
2761 struct pipe_box src_box;
2762
2763 u_box_1d(ttrans->b.offset + ttrans->b.box.x % tc->map_buffer_alignment +
2764 (box->x - ttrans->b.box.x),
2765 box->width, &src_box);
2766
2767 /* Copy the staging buffer into the original one. */
2768 tc_resource_copy_region(&tc->base, ttrans->b.resource, 0, box->x, 0, 0,
2769 ttrans->staging, 0, &src_box);
2770 }
2771
2772 /* Don't update the valid range when we're uploading the CPU storage
2773 * because it includes the uninitialized range too.
2774 */
2775 if (!(ttrans->b.usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
2776 util_range_add(&tres->b, ttrans->valid_buffer_range,
2777 box->x, box->x + box->width);
2778 }
2779 }
2780
2781 static void
2782 tc_transfer_flush_region(struct pipe_context *_pipe,
2783 struct pipe_transfer *transfer,
2784 const struct pipe_box *rel_box)
2785 {
2786 struct threaded_context *tc = threaded_context(_pipe);
2787 struct threaded_transfer *ttrans = threaded_transfer(transfer);
2788 struct threaded_resource *tres = threaded_resource(transfer->resource);
2789 unsigned required_usage = PIPE_MAP_WRITE |
2790 PIPE_MAP_FLUSH_EXPLICIT;
2791
2792 if (tres->b.target == PIPE_BUFFER) {
2793 if ((transfer->usage & required_usage) == required_usage) {
2794 struct pipe_box box;
2795
2796 u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
2797 tc_buffer_do_flush_region(tc, ttrans, &box);
2798 }
2799
2800 /* Staging transfers don't send the call to the driver.
2801 *
2802 * Transfers using the CPU storage shouldn't call transfer_flush_region
2803 * in the driver because the buffer is not really mapped on the driver
2804 * side and the CPU storage always re-uploads everything (flush_region
2805 * makes no difference).
2806 */
2807 if (ttrans->staging || ttrans->cpu_storage_mapped)
2808 return;
2809 }
2810
2811 struct tc_transfer_flush_region *p =
2812 tc_add_call(tc, TC_CALL_transfer_flush_region, tc_transfer_flush_region);
2813 p->transfer = transfer;
2814 p->box = *rel_box;
2815 }
2816
2817 struct tc_buffer_unmap {
2818 struct tc_call_base base;
2819 bool was_staging_transfer;
2820 union {
2821 struct pipe_transfer *transfer;
2822 struct pipe_resource *resource;
2823 };
2824 };
2825
2826 static uint16_t ALWAYS_INLINE
2827 tc_call_buffer_unmap(struct pipe_context *pipe, void *call)
2828 {
2829 struct tc_buffer_unmap *p = to_call(call, tc_buffer_unmap);
2830
2831 if (p->was_staging_transfer) {
2832 struct threaded_resource *tres = threaded_resource(p->resource);
2833 /* Nothing to do except keep track of the staging uploads */
2834 assert(tres->pending_staging_uploads > 0);
2835 p_atomic_dec(&tres->pending_staging_uploads);
2836 tc_drop_resource_reference(p->resource);
2837 } else {
2838 pipe->buffer_unmap(pipe, p->transfer);
2839 }
2840
2841 return call_size(tc_buffer_unmap);
2842 }
2843
2844 static void
2845 tc_buffer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
2846 {
2847 struct threaded_context *tc = threaded_context(_pipe);
2848 struct threaded_transfer *ttrans = threaded_transfer(transfer);
2849 struct threaded_resource *tres = threaded_resource(transfer->resource);
2850
2851 /* PIPE_MAP_THREAD_SAFE is only valid with UNSYNCHRONIZED. It can be
2852 * called from any thread and bypasses all multithreaded queues.
2853 */
2854 if (transfer->usage & PIPE_MAP_THREAD_SAFE) {
2855 assert(transfer->usage & PIPE_MAP_UNSYNCHRONIZED);
2856 assert(!(transfer->usage & (PIPE_MAP_FLUSH_EXPLICIT |
2857 PIPE_MAP_DISCARD_RANGE)));
2858
2859 struct pipe_context *pipe = tc->pipe;
2860 util_range_add(&tres->b, ttrans->valid_buffer_range,
2861 transfer->box.x, transfer->box.x + transfer->box.width);
2862
2863 pipe->buffer_unmap(pipe, transfer);
2864 return;
2865 }
2866
2867 if (transfer->usage & PIPE_MAP_WRITE &&
2868 !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT))
2869 tc_buffer_do_flush_region(tc, ttrans, &transfer->box);
2870
2871 if (ttrans->cpu_storage_mapped) {
2872 /* GL allows simultaneous GPU stores with mapped buffers as long as GPU stores don't
2873 * touch the mapped range. That's a problem because GPU stores free the CPU storage.
2874 * If that happens, we just ignore the unmap call and don't upload anything to prevent
2875 * a crash.
2876 *
2877 * Disallow the CPU storage in the driver to work around this.
2878 */
2879 assert(tres->cpu_storage);
2880
2881 if (tres->cpu_storage) {
2882 tc_invalidate_buffer(tc, tres);
2883 tc_buffer_subdata(&tc->base, &tres->b,
2884 PIPE_MAP_UNSYNCHRONIZED |
2885 TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE,
2886 0, tres->b.width0, tres->cpu_storage);
2887 /* This shouldn't have been freed by buffer_subdata. */
2888 assert(tres->cpu_storage);
2889 } else {
2890 static bool warned_once = false;
2891 if (!warned_once) {
2892 fprintf(stderr, "This application is incompatible with cpu_storage.\n");
2893 fprintf(stderr, "Use tc_max_cpu_storage_size=0 to disable it and report this issue to Mesa.\n");
2894 warned_once = true;
2895 }
2896 }
2897
2898 tc_drop_resource_reference(ttrans->staging);
2899 slab_free(&tc->pool_transfers, ttrans);
2900 return;
2901 }
2902
2903 bool was_staging_transfer = false;
2904
2905 if (ttrans->staging) {
2906 was_staging_transfer = true;
2907
2908 tc_drop_resource_reference(ttrans->staging);
2909 slab_free(&tc->pool_transfers, ttrans);
2910 }
2911
2912 struct tc_buffer_unmap *p = tc_add_call(tc, TC_CALL_buffer_unmap,
2913 tc_buffer_unmap);
2914 if (was_staging_transfer) {
2915 tc_set_resource_reference(&p->resource, &tres->b);
2916 p->was_staging_transfer = true;
2917 } else {
2918 p->transfer = transfer;
2919 p->was_staging_transfer = false;
2920 }
2921
2922 /* tc_buffer_map directly maps the buffers, but tc_buffer_unmap
2923 * defers the unmap operation to the batch execution.
2924 * bytes_mapped_estimate is an estimate of the map/unmap bytes delta,
2925 * and if it exceeds an optional limit, the current batch is flushed
2926 * to reclaim some RAM. */
2927 if (!ttrans->staging && tc->bytes_mapped_limit &&
2928 tc->bytes_mapped_estimate > tc->bytes_mapped_limit) {
2929 tc_flush(_pipe, NULL, PIPE_FLUSH_ASYNC);
2930 }
2931 }
2932
2933 struct tc_texture_unmap {
2934 struct tc_call_base base;
2935 struct pipe_transfer *transfer;
2936 };
2937
2938 static uint16_t ALWAYS_INLINE
2939 tc_call_texture_unmap(struct pipe_context *pipe, void *call)
2940 {
2941 struct tc_texture_unmap *p = (struct tc_texture_unmap *) call;
2942
2943 pipe->texture_unmap(pipe, p->transfer);
2944 return call_size(tc_texture_unmap);
2945 }
2946
2947 static void
2948 tc_texture_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
2949 {
2950 struct threaded_context *tc = threaded_context(_pipe);
2951 struct threaded_transfer *ttrans = threaded_transfer(transfer);
2952
2953 /* enable subdata again once resource is no longer mapped */
2954 tc_set_resource_batch_usage_persistent(tc, transfer->resource, false);
2955
2956 tc_add_call(tc, TC_CALL_texture_unmap, tc_texture_unmap)->transfer = transfer;
2957
2958 /* tc_texture_map directly maps the textures, but tc_texture_unmap
2959 * defers the unmap operation to the batch execution.
2960 * bytes_mapped_estimate is an estimate of the map/unmap bytes delta,
2961 * and if it exceeds an optional limit, the current batch is flushed
2962 * to reclaim some RAM. */
2963 if (!ttrans->staging && tc->bytes_mapped_limit &&
2964 tc->bytes_mapped_estimate > tc->bytes_mapped_limit) {
2965 tc_flush(_pipe, NULL, PIPE_FLUSH_ASYNC);
2966 }
2967 }
2968
2969 struct tc_buffer_subdata {
2970 struct tc_call_base base;
2971 unsigned usage, offset, size;
2972 struct pipe_resource *resource;
2973 char slot[0]; /* more will be allocated if needed */
2974 };
2975
2976 static uint16_t ALWAYS_INLINE
2977 tc_call_buffer_subdata(struct pipe_context *pipe, void *call)
2978 {
2979 struct tc_buffer_subdata *p = (struct tc_buffer_subdata *)call;
2980
2981 pipe->buffer_subdata(pipe, p->resource, p->usage, p->offset, p->size,
2982 p->slot);
2983 tc_drop_resource_reference(p->resource);
2984 return p->base.num_slots;
2985 }
2986
2987 static bool
2988 is_mergeable_buffer_subdata(const struct tc_call_base *previous_call,
2989 unsigned usage, unsigned offset,
2990 struct pipe_resource *resource)
2991 {
2992 if (!previous_call || previous_call->call_id != TC_CALL_buffer_subdata)
2993 return false;
2994
2995 struct tc_buffer_subdata *subdata = (struct tc_buffer_subdata *)previous_call;
2996
2997 return subdata->usage == usage && subdata->resource == resource
2998 && (subdata->offset + subdata->size) == offset;
2999 }
3000
3001 static void
3002 tc_buffer_subdata(struct pipe_context *_pipe,
3003 struct pipe_resource *resource,
3004 unsigned usage, unsigned offset,
3005 unsigned size, const void *data)
3006 {
3007 struct threaded_context *tc = threaded_context(_pipe);
3008 struct threaded_resource *tres = threaded_resource(resource);
3009
3010 if (!size)
3011 return;
3012
3013 usage |= PIPE_MAP_WRITE;
3014
3015    /* PIPE_MAP_DIRECTLY suppresses implicit DISCARD_RANGE. */
3016 if (!(usage & PIPE_MAP_DIRECTLY))
3017 usage |= PIPE_MAP_DISCARD_RANGE;
3018
3019 usage = tc_improve_map_buffer_flags(tc, tres, usage, offset, size);
3020
3021    /* Unsynchronized and big transfers should use transfer_map. Also handle
3022     * full invalidations, because drivers aren't allowed to do them.
3023     */
3024 if (usage & (PIPE_MAP_UNSYNCHRONIZED |
3025 PIPE_MAP_DISCARD_WHOLE_RESOURCE) ||
3026 size > TC_MAX_SUBDATA_BYTES ||
3027 tres->cpu_storage) {
3028 struct pipe_transfer *transfer;
3029 struct pipe_box box;
3030 uint8_t *map = NULL;
3031
3032 u_box_1d(offset, size, &box);
3033
3034 /* CPU storage is only useful for partial updates. It can add overhead
3035 * on glBufferData calls so avoid using it.
3036 */
3037 if (!tres->cpu_storage && offset == 0 && size == resource->width0)
3038 usage |= TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE;
3039
3040 map = tc_buffer_map(_pipe, resource, 0, usage, &box, &transfer);
3041 if (map) {
3042 memcpy(map, data, size);
3043 tc_buffer_unmap(_pipe, transfer);
3044 }
3045 return;
3046 }
3047
3048 util_range_add(&tres->b, &tres->valid_buffer_range, offset, offset + size);
3049
3050 /* We can potentially merge this subdata call with the previous one (if any),
3051 * if the application does a whole-buffer upload piecewise. */
3052 {
3053 struct tc_call_base *last_call = tc_get_last_mergeable_call(tc);
3054 struct tc_buffer_subdata *merge_dest = (struct tc_buffer_subdata *)last_call;
3055
3056 if (is_mergeable_buffer_subdata(last_call, usage, offset, resource) &&
3057 tc_enlarge_last_mergeable_call(tc, call_size_with_slots(tc_buffer_subdata, merge_dest->size + size))) {
3058 memcpy(merge_dest->slot + merge_dest->size, data, size);
3059 merge_dest->size += size;
3060
3061 /* TODO: We *could* do an invalidate + upload here if we detect that
3062 * the merged subdata call overwrites the entire buffer. However, that's
3063 * a little complicated since we can't add further calls to our batch
3064 * until we have removed the merged subdata call, which means that
3065 * calling tc_invalidate_buffer before we have removed the call will
3066 * blow things up.
3067 *
3068 * Just leave a large, merged subdata call in the batch for now, which is
3069 * at least better than tons of tiny subdata calls.
3070 */
3071
3072 return;
3073 }
3074 }
3075
3076 /* The upload is small. Enqueue it. */
3077 struct tc_buffer_subdata *p =
3078 tc_add_slot_based_call(tc, TC_CALL_buffer_subdata, tc_buffer_subdata, size);
3079
3080 tc_set_resource_reference(&p->resource, resource);
3081    /* This will always be busy because if it weren't, tc_improve_map_buffer_flags
3082     * would have set UNSYNCHRONIZED and we wouldn't get here.
3083     */
3084 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], resource);
3085 p->usage = usage;
3086 p->offset = offset;
3087 p->size = size;
3088 memcpy(p->slot, data, size);
3089
3090 tc_mark_call_mergeable(tc, &p->base);
3091 }
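
/* A sketch of the merge path above, assuming a frontend that uploads one
 * buffer piecewise (buf and data, a const uint8_t pointer, are placeholders).
 * Each call starts exactly where the previous one ended and uses the same
 * resource and usage, so, provided tc_improve_map_buffer_flags() computes the
 * same flags for all of them and none of them fall into the transfer_map path
 * above, is_mergeable_buffer_subdata() matches and the three calls collapse
 * into a single enqueued tc_buffer_subdata with one large slot:
 *
 *    pipe->buffer_subdata(pipe, buf, PIPE_MAP_WRITE,   0, 256, data);
 *    pipe->buffer_subdata(pipe, buf, PIPE_MAP_WRITE, 256, 256, data + 256);
 *    pipe->buffer_subdata(pipe, buf, PIPE_MAP_WRITE, 512, 256, data + 512);
 */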
3092
3093 struct tc_texture_subdata {
3094 struct tc_call_base base;
3095 unsigned level, usage, stride;
3096 struct pipe_box box;
3097 struct pipe_resource *resource;
3098 uintptr_t layer_stride;
3099 char slot[0]; /* more will be allocated if needed */
3100 };
3101
3102 static uint16_t ALWAYS_INLINE
3103 tc_call_texture_subdata(struct pipe_context *pipe, void *call)
3104 {
3105 struct tc_texture_subdata *p = (struct tc_texture_subdata *)call;
3106
3107 pipe->texture_subdata(pipe, p->resource, p->level, p->usage, &p->box,
3108 p->slot, p->stride, p->layer_stride);
3109 tc_drop_resource_reference(p->resource);
3110 return p->base.num_slots;
3111 }
3112
3113 static void
3114 tc_texture_subdata(struct pipe_context *_pipe,
3115 struct pipe_resource *resource,
3116 unsigned level, unsigned usage,
3117 const struct pipe_box *box,
3118 const void *data, unsigned stride,
3119 uintptr_t layer_stride)
3120 {
3121 struct threaded_context *tc = threaded_context(_pipe);
3122 uint64_t size;
3123
3124 assert(box->height >= 1);
3125 assert(box->depth >= 1);
3126
3127 size = (box->depth - 1) * layer_stride +
3128 (box->height - 1) * (uint64_t)stride +
3129 box->width * util_format_get_blocksize(resource->format);
3130 if (!size)
3131 return;
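
   /* Worked example of the size computation above: an RGBA8 box (blocksize 4)
    * of 256x256x1 uploaded with stride == 1024 gives
    * size = 0 * layer_stride + 255 * 1024 + 256 * 4 = 262144 bytes,
    * which is then checked against TC_MAX_SUBDATA_BYTES below.
    */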
3132
3133 /* Small uploads can be enqueued, big uploads must sync. */
3134 if (size <= TC_MAX_SUBDATA_BYTES) {
3135 struct tc_texture_subdata *p =
3136 tc_add_slot_based_call(tc, TC_CALL_texture_subdata, tc_texture_subdata, size);
3137
3138 tc_set_resource_batch_usage(tc, resource);
3139 tc_set_resource_reference(&p->resource, resource);
3140 p->level = level;
3141 p->usage = usage;
3142 p->box = *box;
3143 p->stride = stride;
3144 p->layer_stride = layer_stride;
3145 memcpy(p->slot, data, size);
3146 } else {
3147 struct pipe_context *pipe = tc->pipe;
3148 struct threaded_resource *tres = threaded_resource(resource);
3149 unsigned unsync_usage = TC_TRANSFER_MAP_THREADED_UNSYNC | PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_WRITE;
3150 bool can_unsync = !tc_resource_batch_usage_test_busy(tc, resource) &&
3151 tc->options.is_resource_busy &&
3152 !tc->options.is_resource_busy(tc->pipe->screen, tres->latest, usage | unsync_usage);
3153
3154 if (!can_unsync && resource->usage != PIPE_USAGE_STAGING &&
3155 tc->options.parse_renderpass_info && tc->in_renderpass) {
3156 enum pipe_format format = resource->format;
3157 if (usage & PIPE_MAP_DEPTH_ONLY)
3158 format = util_format_get_depth_only(format);
3159 else if (usage & PIPE_MAP_STENCIL_ONLY)
3160 format = PIPE_FORMAT_S8_UINT;
3161
3162 unsigned fmt_stride = util_format_get_stride(format, box->width);
3163 uint64_t fmt_layer_stride = util_format_get_2d_size(format, stride, box->height);
3164 assert(fmt_layer_stride * box->depth <= UINT32_MAX);
3165
3166 struct pipe_resource *pres = pipe_buffer_create(pipe->screen, 0, PIPE_USAGE_STREAM, layer_stride * box->depth);
3167 pipe->buffer_subdata(pipe, pres, unsync_usage, 0, layer_stride * box->depth, data);
3168 struct pipe_box src_box = *box;
3169 src_box.x = src_box.y = src_box.z = 0;
3170
3171 if (fmt_stride == stride && fmt_layer_stride == layer_stride) {
3172             /* If the strides match, a single copy is enough. */
3173 tc->base.resource_copy_region(&tc->base, resource, level, box->x, box->y, box->z, pres, 0, &src_box);
3174 } else {
3175             /* If the strides don't match, emulate util_copy_box on the GPU and assume the driver will optimize. */
3176 src_box.depth = 1;
3177 for (unsigned z = 0; z < box->depth; ++z, src_box.x = z * layer_stride) {
3178 unsigned dst_x = box->x, dst_y = box->y, width = box->width, height = box->height, dst_z = box->z + z;
3179 int blocksize = util_format_get_blocksize(format);
3180 int blockwidth = util_format_get_blockwidth(format);
3181 int blockheight = util_format_get_blockheight(format);
3182
3183 assert(blocksize > 0);
3184 assert(blockwidth > 0);
3185 assert(blockheight > 0);
3186
3187 dst_x /= blockwidth;
3188 dst_y /= blockheight;
3189 width = DIV_ROUND_UP(width, blockwidth);
3190 height = DIV_ROUND_UP(height, blockheight);
3191
3192 width *= blocksize;
3193
3194 if (width == fmt_stride && width == (unsigned)stride) {
3195 ASSERTED uint64_t size = (uint64_t)height * width;
3196
3197 assert(size <= SIZE_MAX);
3198 assert(dst_x + src_box.width < u_minify(pres->width0, level));
3199 assert(dst_y + src_box.height < u_minify(pres->height0, level));
3200 assert(pres->target != PIPE_TEXTURE_3D || z + src_box.depth < u_minify(pres->depth0, level));
3201 tc->base.resource_copy_region(&tc->base, resource, level, dst_x, dst_y, dst_z, pres, 0, &src_box);
3202 } else {
3203 src_box.height = 1;
3204 for (unsigned i = 0; i < height; i++, dst_y++, src_box.x += stride)
3205 tc->base.resource_copy_region(&tc->base, resource, level, dst_x, dst_y, dst_z, pres, 0, &src_box);
3206 }
3207 }
3208 }
3209
3210 pipe_resource_reference(&pres, NULL);
3211 } else {
3212 if (can_unsync) {
3213 usage |= unsync_usage;
3214 } else {
3215 tc_sync(tc);
3216 tc_set_driver_thread(tc);
3217 }
3218 pipe->texture_subdata(pipe, resource, level, usage, box, data,
3219 stride, layer_stride);
3220 if (!can_unsync)
3221 tc_clear_driver_thread(tc);
3222 }
3223 }
3224 }
3225
3226
3227 /********************************************************************
3228 * miscellaneous
3229 */
3230
3231 #define TC_FUNC_SYNC_RET0(ret_type, func) \
3232 static ret_type \
3233 tc_##func(struct pipe_context *_pipe) \
3234 { \
3235 struct threaded_context *tc = threaded_context(_pipe); \
3236 struct pipe_context *pipe = tc->pipe; \
3237 tc_sync(tc); \
3238 return pipe->func(pipe); \
3239 }
3240
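/* For reference, the invocation below expands (modulo formatting) to:
 *
 *    static uint64_t
 *    tc_get_timestamp(struct pipe_context *_pipe)
 *    {
 *       struct threaded_context *tc = threaded_context(_pipe);
 *       struct pipe_context *pipe = tc->pipe;
 *       tc_sync(tc);
 *       return pipe->get_timestamp(pipe);
 *    }
 */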
3241 TC_FUNC_SYNC_RET0(uint64_t, get_timestamp)
3242
3243 static void
3244 tc_get_sample_position(struct pipe_context *_pipe,
3245 unsigned sample_count, unsigned sample_index,
3246 float *out_value)
3247 {
3248 struct threaded_context *tc = threaded_context(_pipe);
3249 struct pipe_context *pipe = tc->pipe;
3250
3251 pipe->get_sample_position(pipe, sample_count, sample_index,
3252 out_value);
3253 }
3254
3255 static enum pipe_reset_status
3256 tc_get_device_reset_status(struct pipe_context *_pipe)
3257 {
3258 struct threaded_context *tc = threaded_context(_pipe);
3259 struct pipe_context *pipe = tc->pipe;
3260
3261 if (!tc->options.unsynchronized_get_device_reset_status)
3262 tc_sync(tc);
3263
3264 return pipe->get_device_reset_status(pipe);
3265 }
3266
3267 static void
3268 tc_set_device_reset_callback(struct pipe_context *_pipe,
3269 const struct pipe_device_reset_callback *cb)
3270 {
3271 struct threaded_context *tc = threaded_context(_pipe);
3272 struct pipe_context *pipe = tc->pipe;
3273
3274 tc_sync(tc);
3275 pipe->set_device_reset_callback(pipe, cb);
3276 }
3277
3278 struct tc_string_marker {
3279 struct tc_call_base base;
3280 int len;
3281 char slot[0]; /* more will be allocated if needed */
3282 };
3283
3284 static uint16_t ALWAYS_INLINE
3285 tc_call_emit_string_marker(struct pipe_context *pipe, void *call)
3286 {
3287 struct tc_string_marker *p = (struct tc_string_marker *)call;
3288 pipe->emit_string_marker(pipe, p->slot, p->len);
3289 return p->base.num_slots;
3290 }
3291
3292 static void
3293 tc_emit_string_marker(struct pipe_context *_pipe,
3294 const char *string, int len)
3295 {
3296 struct threaded_context *tc = threaded_context(_pipe);
3297
3298 if (len <= TC_MAX_STRING_MARKER_BYTES) {
3299 struct tc_string_marker *p =
3300 tc_add_slot_based_call(tc, TC_CALL_emit_string_marker, tc_string_marker, len);
3301
3302 memcpy(p->slot, string, len);
3303 p->len = len;
3304 } else {
3305 struct pipe_context *pipe = tc->pipe;
3306
3307 tc_sync(tc);
3308 tc_set_driver_thread(tc);
3309 pipe->emit_string_marker(pipe, string, len);
3310 tc_clear_driver_thread(tc);
3311 }
3312 }
3313
3314 static void
3315 tc_dump_debug_state(struct pipe_context *_pipe, FILE *stream,
3316 unsigned flags)
3317 {
3318 struct threaded_context *tc = threaded_context(_pipe);
3319 struct pipe_context *pipe = tc->pipe;
3320
3321 tc_sync(tc);
3322 pipe->dump_debug_state(pipe, stream, flags);
3323 }
3324
3325 static void
3326 tc_set_debug_callback(struct pipe_context *_pipe,
3327 const struct util_debug_callback *cb)
3328 {
3329 struct threaded_context *tc = threaded_context(_pipe);
3330 struct pipe_context *pipe = tc->pipe;
3331
3332 tc_sync(tc);
3333
3334 /* Drop all synchronous debug callbacks. Drivers are expected to be OK
3335 * with this. shader-db will use an environment variable to disable
3336 * the threaded context.
3337 */
3338 if (cb && !cb->async)
3339 pipe->set_debug_callback(pipe, NULL);
3340 else
3341 pipe->set_debug_callback(pipe, cb);
3342 }
3343
3344 static void
3345 tc_set_log_context(struct pipe_context *_pipe, struct u_log_context *log)
3346 {
3347 struct threaded_context *tc = threaded_context(_pipe);
3348 struct pipe_context *pipe = tc->pipe;
3349
3350 tc_sync(tc);
3351 pipe->set_log_context(pipe, log);
3352 }
3353
3354 static void
3355 tc_create_fence_fd(struct pipe_context *_pipe,
3356 struct pipe_fence_handle **fence, int fd,
3357 enum pipe_fd_type type)
3358 {
3359 struct threaded_context *tc = threaded_context(_pipe);
3360 struct pipe_context *pipe = tc->pipe;
3361
3362 if (!tc->options.unsynchronized_create_fence_fd)
3363 tc_sync(tc);
3364
3365 pipe->create_fence_fd(pipe, fence, fd, type);
3366 }
3367
3368 struct tc_fence_call {
3369 struct tc_call_base base;
3370 struct pipe_fence_handle *fence;
3371 };
3372
3373 static uint16_t ALWAYS_INLINE
3374 tc_call_fence_server_sync(struct pipe_context *pipe, void *call)
3375 {
3376 struct pipe_fence_handle *fence = to_call(call, tc_fence_call)->fence;
3377
3378 pipe->fence_server_sync(pipe, fence);
3379 pipe->screen->fence_reference(pipe->screen, &fence, NULL);
3380 return call_size(tc_fence_call);
3381 }
3382
3383 static void
3384 tc_fence_server_sync(struct pipe_context *_pipe,
3385 struct pipe_fence_handle *fence)
3386 {
3387 struct threaded_context *tc = threaded_context(_pipe);
3388 struct pipe_screen *screen = tc->pipe->screen;
3389 struct tc_fence_call *call = tc_add_call(tc, TC_CALL_fence_server_sync,
3390 tc_fence_call);
3391
3392 call->fence = NULL;
3393 screen->fence_reference(screen, &call->fence, fence);
3394 }
3395
3396 static void
3397 tc_fence_server_signal(struct pipe_context *_pipe,
3398 struct pipe_fence_handle *fence)
3399 {
3400 struct threaded_context *tc = threaded_context(_pipe);
3401 struct pipe_context *pipe = tc->pipe;
3402 tc_sync(tc);
3403 pipe->fence_server_signal(pipe, fence);
3404 }
3405
3406 static struct pipe_video_codec *
3407 tc_create_video_codec(UNUSED struct pipe_context *_pipe,
3408 UNUSED const struct pipe_video_codec *templ)
3409 {
3410 unreachable("Threaded context should not be enabled for video APIs");
3411 return NULL;
3412 }
3413
3414 static struct pipe_video_buffer *
3415 tc_create_video_buffer(UNUSED struct pipe_context *_pipe,
3416 UNUSED const struct pipe_video_buffer *templ)
3417 {
3418 unreachable("Threaded context should not be enabled for video APIs");
3419 return NULL;
3420 }
3421
3422 struct tc_context_param {
3423 struct tc_call_base base;
3424 enum pipe_context_param param;
3425 unsigned value;
3426 };
3427
3428 static uint16_t ALWAYS_INLINE
3429 tc_call_set_context_param(struct pipe_context *pipe, void *call)
3430 {
3431 struct tc_context_param *p = to_call(call, tc_context_param);
3432
3433 if (pipe->set_context_param)
3434 pipe->set_context_param(pipe, p->param, p->value);
3435
3436 return call_size(tc_context_param);
3437 }
3438
3439 static void
3440 tc_set_context_param(struct pipe_context *_pipe,
3441 enum pipe_context_param param,
3442 unsigned value)
3443 {
3444 struct threaded_context *tc = threaded_context(_pipe);
3445
3446 if (param == PIPE_CONTEXT_PARAM_UPDATE_THREAD_SCHEDULING) {
3447 util_thread_sched_apply_policy(tc->queue.threads[0],
3448 UTIL_THREAD_THREADED_CONTEXT, value,
3449 NULL);
3450
3451 /* Execute this immediately (without enqueuing).
3452 * It's required to be thread-safe.
3453 */
3454 struct pipe_context *pipe = tc->pipe;
3455 if (pipe->set_context_param)
3456 pipe->set_context_param(pipe, param, value);
3457 return;
3458 }
3459
3460 if (tc->pipe->set_context_param) {
3461 struct tc_context_param *call =
3462 tc_add_call(tc, TC_CALL_set_context_param, tc_context_param);
3463
3464 call->param = param;
3465 call->value = value;
3466 }
3467 }
3468
3469
3470 /********************************************************************
3471 * draw, launch, clear, blit, copy, flush
3472 */
3473
3474 struct tc_flush_deferred_call {
3475 struct tc_call_base base;
3476 unsigned flags;
3477 struct pipe_fence_handle *fence;
3478 };
3479
3480 struct tc_flush_call {
3481 struct tc_call_base base;
3482 unsigned flags;
3483 struct pipe_fence_handle *fence;
3484 struct threaded_context *tc;
3485 };
3486
3487 static void
3488 tc_flush_queries(struct threaded_context *tc)
3489 {
3490 struct threaded_query *tq, *tmp;
3491 LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
3492 list_del(&tq->head_unflushed);
3493
3494 /* Memory release semantics: due to a possible race with
3495 * tc_get_query_result, we must ensure that the linked list changes
3496 * are visible before setting tq->flushed.
3497 */
3498 p_atomic_set(&tq->flushed, true);
3499 }
3500 }
3501
3502 static uint16_t ALWAYS_INLINE
3503 tc_call_flush_deferred(struct pipe_context *pipe, void *call)
3504 {
3505 struct tc_flush_deferred_call *p = to_call(call, tc_flush_deferred_call);
3506 struct pipe_screen *screen = pipe->screen;
3507
3508 pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
3509 screen->fence_reference(screen, &p->fence, NULL);
3510
3511 return call_size(tc_flush_deferred_call);
3512 }
3513
3514 static uint16_t ALWAYS_INLINE
3515 tc_call_flush(struct pipe_context *pipe, void *call)
3516 {
3517 struct tc_flush_call *p = to_call(call, tc_flush_call);
3518 struct pipe_screen *screen = pipe->screen;
3519
3520 pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
3521 screen->fence_reference(screen, &p->fence, NULL);
3522
3523 tc_flush_queries(p->tc);
3524
3525 return call_size(tc_flush_call);
3526 }
3527
3528 static void
3529 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
3530 unsigned flags)
3531 {
3532 struct threaded_context *tc = threaded_context(_pipe);
3533 struct pipe_context *pipe = tc->pipe;
3534 struct pipe_screen *screen = pipe->screen;
3535 bool async = flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC);
3536 bool deferred = (flags & PIPE_FLUSH_DEFERRED) > 0;
3537
3538 if (!deferred || !fence)
3539 tc->in_renderpass = false;
3540
3541 if (async && tc->options.create_fence) {
3542 if (fence) {
3543 struct tc_batch *next = &tc->batch_slots[tc->next];
3544
3545 if (!next->token) {
3546 next->token = malloc(sizeof(*next->token));
3547 if (!next->token)
3548 goto out_of_memory;
3549
3550 pipe_reference_init(&next->token->ref, 1);
3551 next->token->tc = tc;
3552 }
3553
3554 screen->fence_reference(screen, fence,
3555 tc->options.create_fence(pipe, next->token));
3556 if (!*fence)
3557 goto out_of_memory;
3558 }
3559
3560 struct tc_flush_call *p;
3561 if (deferred) {
3562 /* these have identical fields */
3563 p = (struct tc_flush_call *)tc_add_call(tc, TC_CALL_flush_deferred, tc_flush_deferred_call);
3564 } else {
3565 p = tc_add_call(tc, TC_CALL_flush, tc_flush_call);
3566 p->tc = tc;
3567 }
3568 p->fence = fence ? *fence : NULL;
3569 p->flags = flags | TC_FLUSH_ASYNC;
3570
3571 if (!deferred) {
3572 /* non-deferred async flushes indicate completion of existing renderpass info */
3573 tc_signal_renderpass_info_ready(tc);
3574 tc_batch_flush(tc, false);
3575 tc->seen_fb_state = false;
3576 }
3577
3578 return;
3579 }
3580
3581 out_of_memory:
3582 tc->flushing = true;
3583 /* renderpass info is signaled during sync */
3584 tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
3585 flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
3586
3587 if (!deferred) {
3588 tc_flush_queries(tc);
3589 tc->seen_fb_state = false;
3590 tc->query_ended = false;
3591 }
3592 tc_set_driver_thread(tc);
3593 pipe->flush(pipe, fence, flags);
3594 tc_clear_driver_thread(tc);
3595 tc->flushing = false;
3596 }
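
/* A rough sketch of the async fence path above from a frontend's point of
 * view, assuming the driver provides tc->options.create_fence (without it
 * tc_flush takes the synchronous fallback below the out_of_memory label):
 *
 *    struct pipe_fence_handle *fence = NULL;
 *    ctx->flush(ctx, &fence, PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC);
 *    ...
 *    screen->fence_finish(screen, ctx, fence, UINT64_MAX);
 *
 * No tc_sync happens on that path; the returned fence is backed by the batch
 * token and is resolved once the enqueued flush call actually executes.
 */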
3597
3598 struct tc_draw_single_drawid {
3599 struct tc_draw_single base;
3600 unsigned drawid_offset;
3601 };
3602
3603 static uint16_t ALWAYS_INLINE
3604 tc_call_draw_single_drawid(struct pipe_context *pipe, void *call)
3605 {
3606 struct tc_draw_single_drawid *info_drawid = to_call(call, tc_draw_single_drawid);
3607 struct tc_draw_single *info = &info_drawid->base;
3608
3609 /* u_threaded_context stores start/count in min/max_index for single draws. */
3610 /* Drivers using u_threaded_context shouldn't use min/max_index. */
3611 struct pipe_draw_start_count_bias draw;
3612
3613 draw.start = info->info.min_index;
3614 draw.count = info->info.max_index;
3615 draw.index_bias = info->index_bias;
3616
3617 info->info.index_bounds_valid = false;
3618 info->info.has_user_indices = false;
3619 info->info.take_index_buffer_ownership = false;
3620
3621 pipe->draw_vbo(pipe, &info->info, info_drawid->drawid_offset, NULL, &draw, 1);
3622 if (info->info.index_size)
3623 tc_drop_resource_reference(info->info.index.resource);
3624
3625 return call_size(tc_draw_single_drawid);
3626 }
3627
3628 static void
3629 simplify_draw_info(struct pipe_draw_info *info)
3630 {
3631 /* Clear these fields to facilitate draw merging.
3632 * Drivers shouldn't use them.
3633 */
3634 info->has_user_indices = false;
3635 info->index_bounds_valid = false;
3636 info->take_index_buffer_ownership = false;
3637 info->index_bias_varies = false;
3638 info->_pad = 0;
3639
3640 /* This shouldn't be set when merging single draws. */
3641 info->increment_draw_id = false;
3642
3643 if (info->index_size) {
3644 if (!info->primitive_restart)
3645 info->restart_index = 0;
3646 } else {
3647 assert(!info->primitive_restart);
3648 info->primitive_restart = false;
3649 info->restart_index = 0;
3650 info->index.resource = NULL;
3651 }
3652 }
3653
3654 static bool
3655 is_next_call_a_mergeable_draw(struct tc_draw_single *first,
3656 struct tc_draw_single *next)
3657 {
3658 if (next->base.call_id != TC_CALL_draw_single)
3659 return false;
3660
3661 STATIC_ASSERT(offsetof(struct pipe_draw_info, min_index) ==
3662 sizeof(struct pipe_draw_info) - 8);
3663 STATIC_ASSERT(offsetof(struct pipe_draw_info, max_index) ==
3664 sizeof(struct pipe_draw_info) - 4);
3665 /* All fields must be the same except start and count. */
3666 /* u_threaded_context stores start/count in min/max_index for single draws. */
3667 return memcmp((uint32_t*)&first->info, (uint32_t*)&next->info,
3668 DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX) == 0;
3669 }
3670
3671 static uint16_t ALWAYS_INLINE
3672 tc_call_draw_single(struct pipe_context *pipe, void *call)
3673 {
3674 /* Draw call merging. */
3675 struct tc_draw_single *first = to_call(call, tc_draw_single);
3676 struct tc_draw_single *next = get_next_call(first, tc_draw_single);
3677
3678 /* If at least 2 consecutive draw calls can be merged... */
3679 if (next->base.call_id == TC_CALL_draw_single) {
3680 if (is_next_call_a_mergeable_draw(first, next)) {
3681 /* The maximum number of merged draws is given by the batch size. */
3682 struct pipe_draw_start_count_bias multi[TC_SLOTS_PER_BATCH / call_size(tc_draw_single)];
3683 unsigned num_draws = 2;
3684 bool index_bias_varies = first->index_bias != next->index_bias;
3685
3686 /* u_threaded_context stores start/count in min/max_index for single draws. */
3687 multi[0].start = first->info.min_index;
3688 multi[0].count = first->info.max_index;
3689 multi[0].index_bias = first->index_bias;
3690 multi[1].start = next->info.min_index;
3691 multi[1].count = next->info.max_index;
3692 multi[1].index_bias = next->index_bias;
3693
3694 /* Find how many other draws can be merged. */
3695 next = get_next_call(next, tc_draw_single);
3696 for (; is_next_call_a_mergeable_draw(first, next);
3697 next = get_next_call(next, tc_draw_single), num_draws++) {
3698 /* u_threaded_context stores start/count in min/max_index for single draws. */
3699 multi[num_draws].start = next->info.min_index;
3700 multi[num_draws].count = next->info.max_index;
3701 multi[num_draws].index_bias = next->index_bias;
3702 index_bias_varies |= first->index_bias != next->index_bias;
3703 }
3704
3705 first->info.index_bias_varies = index_bias_varies;
3706 pipe->draw_vbo(pipe, &first->info, 0, NULL, multi, num_draws);
3707
3708 /* Since all draws use the same index buffer, drop all references at once. */
3709 if (first->info.index_size)
3710 pipe_drop_resource_references(first->info.index.resource, num_draws);
3711
3712 return call_size(tc_draw_single) * num_draws;
3713 }
3714 }
3715
3716 /* u_threaded_context stores start/count in min/max_index for single draws. */
3717 /* Drivers using u_threaded_context shouldn't use min/max_index. */
3718 struct pipe_draw_start_count_bias draw;
3719
3720 draw.start = first->info.min_index;
3721 draw.count = first->info.max_index;
3722 draw.index_bias = first->index_bias;
3723
3724 first->info.index_bounds_valid = false;
3725 first->info.has_user_indices = false;
3726 first->info.take_index_buffer_ownership = false;
3727
3728 pipe->draw_vbo(pipe, &first->info, 0, NULL, &draw, 1);
3729 if (first->info.index_size)
3730 tc_drop_resource_reference(first->info.index.resource);
3731
3732 return call_size(tc_draw_single);
3733 }
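
/* Concretely, two back-to-back single draws that share every pipe_draw_info
 * field (including the index buffer) and differ only in start/count, e.g.
 * (info and draw[] are placeholders):
 *
 *    ctx->draw_vbo(ctx, &info, 0, NULL, &draw[0], 1);
 *    ctx->draw_vbo(ctx, &info, 0, NULL, &draw[1], 1);
 *
 * are replayed here as one driver pipe->draw_vbo() with num_draws == 2, and
 * longer runs keep merging until a non-matching call is encountered.
 */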
3734
3735 struct tc_draw_indirect {
3736 struct tc_call_base base;
3737 struct pipe_draw_start_count_bias draw;
3738 struct pipe_draw_info info;
3739 struct pipe_draw_indirect_info indirect;
3740 };
3741
3742 static uint16_t ALWAYS_INLINE
3743 tc_call_draw_indirect(struct pipe_context *pipe, void *call)
3744 {
3745 struct tc_draw_indirect *info = to_call(call, tc_draw_indirect);
3746
3747 info->info.index_bounds_valid = false;
3748 info->info.take_index_buffer_ownership = false;
3749
3750 pipe->draw_vbo(pipe, &info->info, 0, &info->indirect, &info->draw, 1);
3751 if (info->info.index_size)
3752 tc_drop_resource_reference(info->info.index.resource);
3753
3754 tc_drop_resource_reference(info->indirect.buffer);
3755 tc_drop_resource_reference(info->indirect.indirect_draw_count);
3756 tc_drop_so_target_reference(info->indirect.count_from_stream_output);
3757 return call_size(tc_draw_indirect);
3758 }
3759
3760 struct tc_draw_multi {
3761 struct tc_call_base base;
3762 unsigned num_draws;
3763 struct pipe_draw_info info;
3764 struct pipe_draw_start_count_bias slot[]; /* variable-sized array */
3765 };
3766
3767 static uint16_t ALWAYS_INLINE
3768 tc_call_draw_multi(struct pipe_context *pipe, void *call)
3769 {
3770 struct tc_draw_multi *info = (struct tc_draw_multi*)call;
3771
3772 info->info.has_user_indices = false;
3773 info->info.index_bounds_valid = false;
3774 info->info.take_index_buffer_ownership = false;
3775
3776 pipe->draw_vbo(pipe, &info->info, 0, NULL, info->slot, info->num_draws);
3777 if (info->info.index_size)
3778 tc_drop_resource_reference(info->info.index.resource);
3779
3780 return info->base.num_slots;
3781 }
3782
3783 #define DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX \
3784 offsetof(struct pipe_draw_info, index)
3785
3786 /* Single draw with drawid_offset == 0. */
3787 static void
3788 tc_draw_single(struct pipe_context *_pipe, const struct pipe_draw_info *info,
3789 unsigned drawid_offset,
3790 const struct pipe_draw_indirect_info *indirect,
3791 const struct pipe_draw_start_count_bias *draws,
3792 unsigned num_draws)
3793 {
3794 struct threaded_context *tc = threaded_context(_pipe);
3795 struct tc_draw_single *p =
3796 tc_add_call(tc, TC_CALL_draw_single, tc_draw_single);
3797
3798 if (info->index_size) {
3799 if (!info->take_index_buffer_ownership) {
3800 tc_set_resource_reference(&p->info.index.resource,
3801 info->index.resource);
3802 }
3803 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->index.resource);
3804 }
3805 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
3806 /* u_threaded_context stores start/count in min/max_index for single draws. */
3807 p->info.min_index = draws[0].start;
3808 p->info.max_index = draws[0].count;
3809 p->index_bias = draws[0].index_bias;
3810 simplify_draw_info(&p->info);
3811 }
3812
3813 /* Single draw with drawid_offset > 0. */
3814 static void
3815 tc_draw_single_draw_id(struct pipe_context *_pipe,
3816 const struct pipe_draw_info *info,
3817 unsigned drawid_offset,
3818 const struct pipe_draw_indirect_info *indirect,
3819 const struct pipe_draw_start_count_bias *draws,
3820 unsigned num_draws)
3821 {
3822 struct threaded_context *tc = threaded_context(_pipe);
3823 struct tc_draw_single *p =
3824 &tc_add_call(tc, TC_CALL_draw_single_drawid, tc_draw_single_drawid)->base;
3825
3826 if (info->index_size) {
3827 if (!info->take_index_buffer_ownership) {
3828 tc_set_resource_reference(&p->info.index.resource,
3829 info->index.resource);
3830 }
3831 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->index.resource);
3832 }
3833 ((struct tc_draw_single_drawid*)p)->drawid_offset = drawid_offset;
3834 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
3835 /* u_threaded_context stores start/count in min/max_index for single draws. */
3836 p->info.min_index = draws[0].start;
3837 p->info.max_index = draws[0].count;
3838 p->index_bias = draws[0].index_bias;
3839 simplify_draw_info(&p->info);
3840 }
3841
3842 /* Single draw with user indices and drawid_offset == 0. */
3843 static void
3844 tc_draw_user_indices_single(struct pipe_context *_pipe,
3845 const struct pipe_draw_info *info,
3846 unsigned drawid_offset,
3847 const struct pipe_draw_indirect_info *indirect,
3848 const struct pipe_draw_start_count_bias *draws,
3849 unsigned num_draws)
3850 {
3851 struct threaded_context *tc = threaded_context(_pipe);
3852 unsigned index_size = info->index_size;
3853 unsigned size = draws[0].count * index_size;
3854 struct pipe_resource *buffer = NULL;
3855 unsigned offset;
3856
3857 if (!size)
3858 return;
3859
3860    /* This must be done before adding draw_vbo, because the upload could
3861     * generate e.g. transfer_unmap and flush a partially-initialized draw_vbo
3862     * to the driver if it were done afterwards.
3863     */
3864 u_upload_data(tc->base.stream_uploader, 0, size, 4,
3865 (uint8_t*)info->index.user + draws[0].start * index_size,
3866 &offset, &buffer);
3867 if (unlikely(!buffer))
3868 return;
3869
3870 struct tc_draw_single *p =
3871 tc_add_call(tc, TC_CALL_draw_single, tc_draw_single);
3872 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
3873 p->info.index.resource = buffer;
3874 /* u_threaded_context stores start/count in min/max_index for single draws. */
3875 p->info.min_index = offset >> util_logbase2(index_size);
3876 p->info.max_index = draws[0].count;
3877 p->index_bias = draws[0].index_bias;
3878 simplify_draw_info(&p->info);
3879 }
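
/* The shift above converts the upload's byte offset into an element index,
 * which is what the stored draw start means: e.g. with 2-byte indices
 * (index_size == 2, util_logbase2(2) == 1), an upload placed at byte offset
 * 512 stores min_index = 512 >> 1 = 256.
 */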
3880
3881 /* Single draw with user indices and drawid_offset > 0. */
3882 static void
3883 tc_draw_user_indices_single_draw_id(struct pipe_context *_pipe,
3884 const struct pipe_draw_info *info,
3885 unsigned drawid_offset,
3886 const struct pipe_draw_indirect_info *indirect,
3887 const struct pipe_draw_start_count_bias *draws,
3888 unsigned num_draws)
3889 {
3890 struct threaded_context *tc = threaded_context(_pipe);
3891 unsigned index_size = info->index_size;
3892 unsigned size = draws[0].count * index_size;
3893 struct pipe_resource *buffer = NULL;
3894 unsigned offset;
3895
3896 if (!size)
3897 return;
3898
3899    /* This must be done before adding draw_vbo, because the upload could
3900     * generate e.g. transfer_unmap and flush a partially-initialized draw_vbo
3901     * to the driver if it were done afterwards.
3902     */
3903 u_upload_data(tc->base.stream_uploader, 0, size, 4,
3904 (uint8_t*)info->index.user + draws[0].start * index_size,
3905 &offset, &buffer);
3906 if (unlikely(!buffer))
3907 return;
3908
3909 struct tc_draw_single *p =
3910 &tc_add_call(tc, TC_CALL_draw_single_drawid, tc_draw_single_drawid)->base;
3911 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
3912 p->info.index.resource = buffer;
3913 ((struct tc_draw_single_drawid*)p)->drawid_offset = drawid_offset;
3914 /* u_threaded_context stores start/count in min/max_index for single draws. */
3915 p->info.min_index = offset >> util_logbase2(index_size);
3916 p->info.max_index = draws[0].count;
3917 p->index_bias = draws[0].index_bias;
3918 simplify_draw_info(&p->info);
3919 }
3920
3921 #define DRAW_OVERHEAD_BYTES sizeof(struct tc_draw_multi)
3922 #define ONE_DRAW_SLOT_BYTES sizeof(((struct tc_draw_multi*)NULL)->slot[0])
3923
3924 #define SLOTS_FOR_ONE_DRAW \
3925 DIV_ROUND_UP(DRAW_OVERHEAD_BYTES + ONE_DRAW_SLOT_BYTES, \
3926 sizeof(struct tc_call_base))
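
/* Purely illustrative numbers (the real values depend on the struct layouts):
 * with sizeof(struct tc_call_base) == 8, DRAW_OVERHEAD_BYTES == 136 and
 * ONE_DRAW_SLOT_BYTES == 16, SLOTS_FOR_ONE_DRAW == DIV_ROUND_UP(136 + 16, 8)
 * == 19, i.e. the minimum room the batch-splitting loops below require before
 * they give up on the current batch.
 */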
3927
3928 static void
3929 tc_draw_multi(struct pipe_context *_pipe, const struct pipe_draw_info *info,
3930 unsigned drawid_offset,
3931 const struct pipe_draw_indirect_info *indirect,
3932 const struct pipe_draw_start_count_bias *draws,
3933 unsigned num_draws)
3934 {
3935 struct threaded_context *tc = threaded_context(_pipe);
3936 int total_offset = 0;
3937 bool take_index_buffer_ownership = info->take_index_buffer_ownership;
3938
3939 while (num_draws) {
3940 struct tc_batch *next = &tc->batch_slots[tc->next];
3941
3942 int nb_slots_left = TC_SLOTS_PER_BATCH - 1 - next->num_total_slots;
3943       /* If there isn't enough room for one draw, start filling the next batch */
3944 if (nb_slots_left < SLOTS_FOR_ONE_DRAW)
3945 nb_slots_left = TC_SLOTS_PER_BATCH - 1;
3946 const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base);
3947
3948 /* How many draws can we fit in the current batch */
3949 const int dr = MIN2(num_draws, (size_left_bytes - DRAW_OVERHEAD_BYTES) /
3950 ONE_DRAW_SLOT_BYTES);
3951
3952 /* Non-indexed call or indexed with a real index buffer. */
3953 struct tc_draw_multi *p =
3954 tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
3955 dr);
3956 if (info->index_size) {
3957 if (!take_index_buffer_ownership) {
3958 tc_set_resource_reference(&p->info.index.resource,
3959 info->index.resource);
3960 }
3961 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->index.resource);
3962 }
3963 take_index_buffer_ownership = false;
3964 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
3965 p->num_draws = dr;
3966 memcpy(p->slot, &draws[total_offset], sizeof(draws[0]) * dr);
3967 num_draws -= dr;
3968
3969 total_offset += dr;
3970 }
3971 }
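
/* Continuing the illustrative numbers above: if size_left_bytes were 4096,
 * the loop would take dr = MIN2(num_draws, (4096 - 136) / 16) = up to 247
 * draws into this batch and keep iterating until num_draws is exhausted.
 */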
3972
3973 static void
3974 tc_draw_user_indices_multi(struct pipe_context *_pipe,
3975 const struct pipe_draw_info *info,
3976 unsigned drawid_offset,
3977 const struct pipe_draw_indirect_info *indirect,
3978 const struct pipe_draw_start_count_bias *draws,
3979 unsigned num_draws)
3980 {
3981 struct threaded_context *tc = threaded_context(_pipe);
3982 struct pipe_resource *buffer = NULL;
3983 unsigned buffer_offset, total_count = 0;
3984 unsigned index_size_shift = util_logbase2(info->index_size);
3985 uint8_t *ptr = NULL;
3986
3987 /* Get the total count. */
3988 for (unsigned i = 0; i < num_draws; i++)
3989 total_count += draws[i].count;
3990
3991 if (!total_count)
3992 return;
3993
3994 /* Allocate space for all index buffers.
3995 *
3996     * This must be done before adding draw_vbo, because the upload could
3997     * generate e.g. transfer_unmap and flush a partially-initialized draw_vbo
3998     * to the driver if it were done afterwards.
3999 */
4000 u_upload_alloc(tc->base.stream_uploader, 0,
4001 total_count << index_size_shift, 4,
4002 &buffer_offset, &buffer, (void**)&ptr);
4003 if (unlikely(!buffer))
4004 return;
4005
4006 int total_offset = 0;
4007 unsigned offset = 0;
4008 while (num_draws) {
4009 struct tc_batch *next = &tc->batch_slots[tc->next];
4010
4011 int nb_slots_left = TC_SLOTS_PER_BATCH - 1 - next->num_total_slots;
4012       /* If there isn't enough room for one draw, start filling the next batch */
4013 if (nb_slots_left < SLOTS_FOR_ONE_DRAW)
4014 nb_slots_left = TC_SLOTS_PER_BATCH - 1;
4015 const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base);
4016
4017 /* How many draws can we fit in the current batch */
4018 const int dr = MIN2(num_draws, (size_left_bytes - DRAW_OVERHEAD_BYTES) /
4019 ONE_DRAW_SLOT_BYTES);
4020
4021 struct tc_draw_multi *p =
4022 tc_add_slot_based_call(tc, TC_CALL_draw_multi, tc_draw_multi,
4023 dr);
4024 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX);
4025
4026 if (total_offset == 0)
4027 /* the first slot inherits the reference from u_upload_alloc() */
4028 p->info.index.resource = buffer;
4029 else
4030 /* all following slots need a new reference */
4031 tc_set_resource_reference(&p->info.index.resource, buffer);
4032
4033 p->num_draws = dr;
4034
4035 /* Upload index buffers. */
4036 for (unsigned i = 0; i < dr; i++) {
4037 unsigned count = draws[i + total_offset].count;
4038
4039 if (!count) {
4040 p->slot[i].start = 0;
4041 p->slot[i].count = 0;
4042 p->slot[i].index_bias = 0;
4043 continue;
4044 }
4045
4046 unsigned size = count << index_size_shift;
4047 memcpy(ptr + offset,
4048 (uint8_t*)info->index.user +
4049 (draws[i + total_offset].start << index_size_shift), size);
4050 p->slot[i].start = (buffer_offset + offset) >> index_size_shift;
4051 p->slot[i].count = count;
4052 p->slot[i].index_bias = draws[i + total_offset].index_bias;
4053 offset += size;
4054 }
4055
4056 total_offset += dr;
4057 num_draws -= dr;
4058 }
4059 }
4060
4061 static void
4062 tc_draw_indirect(struct pipe_context *_pipe, const struct pipe_draw_info *info,
4063 unsigned drawid_offset,
4064 const struct pipe_draw_indirect_info *indirect,
4065 const struct pipe_draw_start_count_bias *draws,
4066 unsigned num_draws)
4067 {
4068 struct threaded_context *tc = threaded_context(_pipe);
4069 assert(!info->has_user_indices);
4070 assert(num_draws == 1);
4071
4072 struct tc_draw_indirect *p =
4073 tc_add_call(tc, TC_CALL_draw_indirect, tc_draw_indirect);
4074 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
4075
4076 if (info->index_size) {
4077 if (!info->take_index_buffer_ownership) {
4078 tc_set_resource_reference(&p->info.index.resource,
4079 info->index.resource);
4080 }
4081 tc_add_to_buffer_list(next, info->index.resource);
4082 }
4083 memcpy(&p->info, info, DRAW_INFO_SIZE_WITHOUT_MIN_MAX_INDEX);
4084
4085 tc_set_resource_reference(&p->indirect.buffer, indirect->buffer);
4086 tc_set_resource_reference(&p->indirect.indirect_draw_count,
4087 indirect->indirect_draw_count);
4088 p->indirect.count_from_stream_output = NULL;
4089 pipe_so_target_reference(&p->indirect.count_from_stream_output,
4090 indirect->count_from_stream_output);
4091
4092 if (indirect->buffer)
4093 tc_add_to_buffer_list(next, indirect->buffer);
4094 if (indirect->indirect_draw_count)
4095 tc_add_to_buffer_list(next, indirect->indirect_draw_count);
4096 if (indirect->count_from_stream_output)
4097 tc_add_to_buffer_list(next, indirect->count_from_stream_output->buffer);
4098
4099 memcpy(&p->indirect, indirect, sizeof(*indirect));
4100 p->draw.start = draws[0].start;
4101 }
4102
4103 /* Dispatch table for tc_draw_vbo:
4104 *
4105 * Indexed by:
4106 * [is_indirect * 8 + index_size_and_has_user_indices * 4 +
4107 * is_multi_draw * 2 + non_zero_draw_id]
4108 */
4109 static pipe_draw_func draw_funcs[16] = {
4110 tc_draw_single,
4111 tc_draw_single_draw_id,
4112 tc_draw_multi,
4113 tc_draw_multi,
4114 tc_draw_user_indices_single,
4115 tc_draw_user_indices_single_draw_id,
4116 tc_draw_user_indices_multi,
4117 tc_draw_user_indices_multi,
4118 tc_draw_indirect,
4119 tc_draw_indirect,
4120 tc_draw_indirect,
4121 tc_draw_indirect,
4122 tc_draw_indirect,
4123 tc_draw_indirect,
4124 tc_draw_indirect,
4125 tc_draw_indirect,
4126 };
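
/* Example: an indexed draw with user indices, num_draws == 1 and
 * drawid_offset == 0 yields index = 0*8 + 1*4 + 0*2 + 0 = 4, i.e.
 * tc_draw_user_indices_single; every indirect draw selects one of the last
 * eight tc_draw_indirect entries regardless of the lower bits.
 */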
4127
4128 void
4129 tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
4130 unsigned drawid_offset,
4131 const struct pipe_draw_indirect_info *indirect,
4132 const struct pipe_draw_start_count_bias *draws,
4133 unsigned num_draws)
4134 {
4135 STATIC_ASSERT(DRAW_INFO_SIZE_WITHOUT_INDEXBUF_AND_MIN_MAX_INDEX +
4136 sizeof(intptr_t) == offsetof(struct pipe_draw_info, min_index));
4137
4138 struct threaded_context *tc = threaded_context(_pipe);
4139 if (tc->options.parse_renderpass_info)
4140 tc_parse_draw(tc);
4141
4142 /* Use a function table to call the desired variant of draw_vbo. */
4143 unsigned index = (indirect != NULL) * 8 +
4144 (info->index_size && info->has_user_indices) * 4 +
4145 (num_draws > 1) * 2 + (drawid_offset != 0);
4146 draw_funcs[index](_pipe, info, drawid_offset, indirect, draws, num_draws);
4147
4148 /* This must be after tc_add_*call, which can flush the batch. */
4149 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4150 tc_add_all_gfx_bindings_to_buffer_list(tc);
4151 }
4152
4153 struct tc_draw_single *
4154 tc_add_draw_single_call(struct pipe_context *_pipe,
4155 struct pipe_resource *index_bo)
4156 {
4157 struct threaded_context *tc = threaded_context(_pipe);
4158
4159 if (tc->options.parse_renderpass_info)
4160 tc_parse_draw(tc);
4161
4162 struct tc_draw_single *p =
4163 tc_add_call(tc, TC_CALL_draw_single, tc_draw_single);
4164
4165 if (index_bo)
4166 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], index_bo);
4167
4168 /* This must be after tc_add_*call, which can flush the batch. */
4169 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4170 tc_add_all_gfx_bindings_to_buffer_list(tc);
4171
4172 return p;
4173 }
4174
4175 struct tc_draw_vstate_single {
4176 struct tc_call_base base;
4177 struct pipe_draw_start_count_bias draw;
4178
4179 /* The following states must be together without holes because they are
4180 * compared by draw merging.
4181 */
4182 struct pipe_vertex_state *state;
4183 uint32_t partial_velem_mask;
4184 struct pipe_draw_vertex_state_info info;
4185 };
4186
4187 static bool
4188 is_next_call_a_mergeable_draw_vstate(struct tc_draw_vstate_single *first,
4189 struct tc_draw_vstate_single *next)
4190 {
4191 if (next->base.call_id != TC_CALL_draw_vstate_single)
4192 return false;
4193
4194 return !memcmp(&first->state, &next->state,
4195 offsetof(struct tc_draw_vstate_single, info) +
4196 sizeof(struct pipe_draw_vertex_state_info) -
4197 offsetof(struct tc_draw_vstate_single, state));
4198 }
4199
4200 static uint16_t ALWAYS_INLINE
4201 tc_call_draw_vstate_single(struct pipe_context *pipe, void *call)
4202 {
4203 /* Draw call merging. */
4204 struct tc_draw_vstate_single *first = to_call(call, tc_draw_vstate_single);
4205 struct tc_draw_vstate_single *next = get_next_call(first, tc_draw_vstate_single);
4206
4207 /* If at least 2 consecutive draw calls can be merged... */
4208 if (is_next_call_a_mergeable_draw_vstate(first, next)) {
4209 /* The maximum number of merged draws is given by the batch size. */
4210 struct pipe_draw_start_count_bias draws[TC_SLOTS_PER_BATCH /
4211 call_size(tc_draw_vstate_single)];
4212 unsigned num_draws = 2;
4213
4214 draws[0] = first->draw;
4215 draws[1] = next->draw;
4216
4217 /* Find how many other draws can be merged. */
4218 next = get_next_call(next, tc_draw_vstate_single);
4219 for (; is_next_call_a_mergeable_draw_vstate(first, next);
4220 next = get_next_call(next, tc_draw_vstate_single),
4221 num_draws++)
4222 draws[num_draws] = next->draw;
4223
4224 pipe->draw_vertex_state(pipe, first->state, first->partial_velem_mask,
4225 first->info, draws, num_draws);
4226 /* Since all draws use the same state, drop all references at once. */
4227 tc_drop_vertex_state_references(first->state, num_draws);
4228
4229 return call_size(tc_draw_vstate_single) * num_draws;
4230 }
4231
4232 pipe->draw_vertex_state(pipe, first->state, first->partial_velem_mask,
4233 first->info, &first->draw, 1);
4234 tc_drop_vertex_state_references(first->state, 1);
4235 return call_size(tc_draw_vstate_single);
4236 }
4237
4238 struct tc_draw_vstate_multi {
4239 struct tc_call_base base;
4240 uint32_t partial_velem_mask;
4241 struct pipe_draw_vertex_state_info info;
4242 unsigned num_draws;
4243 struct pipe_vertex_state *state;
4244 struct pipe_draw_start_count_bias slot[0];
4245 };
4246
4247 static uint16_t ALWAYS_INLINE
4248 tc_call_draw_vstate_multi(struct pipe_context *pipe, void *call)
4249 {
4250 struct tc_draw_vstate_multi *info = (struct tc_draw_vstate_multi*)call;
4251
4252 pipe->draw_vertex_state(pipe, info->state, info->partial_velem_mask,
4253 info->info, info->slot, info->num_draws);
4254 tc_drop_vertex_state_references(info->state, 1);
4255 return info->base.num_slots;
4256 }
4257
4258 static void
4259 tc_draw_vertex_state(struct pipe_context *_pipe,
4260 struct pipe_vertex_state *state,
4261 uint32_t partial_velem_mask,
4262 struct pipe_draw_vertex_state_info info,
4263 const struct pipe_draw_start_count_bias *draws,
4264 unsigned num_draws)
4265 {
4266 struct threaded_context *tc = threaded_context(_pipe);
4267 if (tc->options.parse_renderpass_info)
4268 tc_parse_draw(tc);
4269
4270 if (num_draws == 1) {
4271 /* Single draw. */
4272 struct tc_draw_vstate_single *p =
4273 tc_add_call(tc, TC_CALL_draw_vstate_single, tc_draw_vstate_single);
4274 p->partial_velem_mask = partial_velem_mask;
4275 p->draw = draws[0];
4276 p->info.mode = info.mode;
4277 p->info.take_vertex_state_ownership = false;
4278
4279       /* This should always be 0 for simplicity because we assume that
4280        * index_bias doesn't vary.
4281        */
4282 assert(draws[0].index_bias == 0);
4283
4284 if (!info.take_vertex_state_ownership)
4285 tc_set_vertex_state_reference(&p->state, state);
4286 else
4287 p->state = state;
4288
4289
4290 /* This must be after tc_add_*call, which can flush the batch. */
4291 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4292 tc_add_all_gfx_bindings_to_buffer_list(tc);
4293 return;
4294 }
4295
4296 const int draw_overhead_bytes = sizeof(struct tc_draw_vstate_multi);
4297 const int one_draw_slot_bytes = sizeof(((struct tc_draw_vstate_multi*)NULL)->slot[0]);
4298 const int slots_for_one_draw = DIV_ROUND_UP(draw_overhead_bytes + one_draw_slot_bytes,
4299 sizeof(struct tc_call_base));
4300 /* Multi draw. */
4301 int total_offset = 0;
4302 bool take_vertex_state_ownership = info.take_vertex_state_ownership;
4303 while (num_draws) {
4304 struct tc_batch *next = &tc->batch_slots[tc->next];
4305
4306 int nb_slots_left = TC_SLOTS_PER_BATCH - 1 - next->num_total_slots;
4307       /* If there isn't enough room for one draw, start filling the next batch */
4308 if (nb_slots_left < slots_for_one_draw)
4309 nb_slots_left = TC_SLOTS_PER_BATCH - 1;
4310 const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base);
4311
4312 /* How many draws can we fit in the current batch */
4313 const int dr = MIN2(num_draws, (size_left_bytes - draw_overhead_bytes) / one_draw_slot_bytes);
4314
4315 /* Non-indexed call or indexed with a real index buffer. */
4316 struct tc_draw_vstate_multi *p =
4317 tc_add_slot_based_call(tc, TC_CALL_draw_vstate_multi, tc_draw_vstate_multi, dr);
4318
4319 if (!take_vertex_state_ownership)
4320 tc_set_vertex_state_reference(&p->state, state);
4321 else
4322 p->state = state;
4323
4324 take_vertex_state_ownership = false;
4325 p->partial_velem_mask = partial_velem_mask;
4326 p->info.mode = info.mode;
4327 p->info.take_vertex_state_ownership = false;
4328 p->num_draws = dr;
4329 memcpy(p->slot, &draws[total_offset], sizeof(draws[0]) * dr);
4330 num_draws -= dr;
4331
4332 total_offset += dr;
4333 }
4334
4335
4336 /* This must be after tc_add_*call, which can flush the batch. */
4337 if (unlikely(tc->add_all_gfx_bindings_to_buffer_list))
4338 tc_add_all_gfx_bindings_to_buffer_list(tc);
4339 }
4340
4341 struct tc_launch_grid_call {
4342 struct tc_call_base base;
4343 struct pipe_grid_info info;
4344 };
4345
4346 static uint16_t ALWAYS_INLINE
4347 tc_call_launch_grid(struct pipe_context *pipe, void *call)
4348 {
4349 struct pipe_grid_info *p = &to_call(call, tc_launch_grid_call)->info;
4350
4351 pipe->launch_grid(pipe, p);
4352 tc_drop_resource_reference(p->indirect);
4353 return call_size(tc_launch_grid_call);
4354 }
4355
4356 static void
4357 tc_launch_grid(struct pipe_context *_pipe,
4358 const struct pipe_grid_info *info)
4359 {
4360 struct threaded_context *tc = threaded_context(_pipe);
4361 struct tc_launch_grid_call *p = tc_add_call(tc, TC_CALL_launch_grid,
4362 tc_launch_grid_call);
4363 assert(info->input == NULL);
4364
4365 tc_set_resource_reference(&p->info.indirect, info->indirect);
4366 memcpy(&p->info, info, sizeof(*info));
4367
4368 if (info->indirect)
4369 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], info->indirect);
4370
4371 /* This must be after tc_add_*call, which can flush the batch. */
4372 if (unlikely(tc->add_all_compute_bindings_to_buffer_list))
4373 tc_add_all_compute_bindings_to_buffer_list(tc);
4374 }
4375
4376 static uint16_t ALWAYS_INLINE
4377 tc_call_resource_copy_region(struct pipe_context *pipe, void *call)
4378 {
4379 struct tc_resource_copy_region *p = to_call(call, tc_resource_copy_region);
4380
4381 pipe->resource_copy_region(pipe, p->dst, p->dst_level, p->dstx, p->dsty,
4382 p->dstz, p->src, p->src_level, &p->src_box);
4383 tc_drop_resource_reference(p->dst);
4384 tc_drop_resource_reference(p->src);
4385 return call_size(tc_resource_copy_region);
4386 }
4387
4388 static void
4389 tc_resource_copy_region(struct pipe_context *_pipe,
4390 struct pipe_resource *dst, unsigned dst_level,
4391 unsigned dstx, unsigned dsty, unsigned dstz,
4392 struct pipe_resource *src, unsigned src_level,
4393 const struct pipe_box *src_box)
4394 {
4395 struct threaded_context *tc = threaded_context(_pipe);
4396 struct threaded_resource *tdst = threaded_resource(dst);
4397 struct tc_resource_copy_region *p =
4398 tc_add_call(tc, TC_CALL_resource_copy_region,
4399 tc_resource_copy_region);
4400
4401 if (dst->target == PIPE_BUFFER)
4402 tc_buffer_disable_cpu_storage(dst);
4403
4404 tc_set_resource_batch_usage(tc, dst);
4405 tc_set_resource_reference(&p->dst, dst);
4406 p->dst_level = dst_level;
4407 p->dstx = dstx;
4408 p->dsty = dsty;
4409 p->dstz = dstz;
4410 tc_set_resource_batch_usage(tc, src);
4411 tc_set_resource_reference(&p->src, src);
4412 p->src_level = src_level;
4413 p->src_box = *src_box;
4414
4415 if (dst->target == PIPE_BUFFER) {
4416 struct tc_buffer_list *next = &tc->buffer_lists[tc->next_buf_list];
4417
4418 tc_add_to_buffer_list(next, src);
4419 tc_add_to_buffer_list(next, dst);
4420
4421 util_range_add(&tdst->b, &tdst->valid_buffer_range,
4422 dstx, dstx + src_box->width);
4423 }
4424 }
4425
4426 struct tc_blit_call {
4427 struct tc_call_base base;
4428 struct pipe_blit_info info;
4429 };
4430
4431 static uint16_t ALWAYS_INLINE
4432 tc_call_blit(struct pipe_context *pipe, void *call)
4433 {
4434 struct pipe_blit_info *blit = &to_call(call, tc_blit_call)->info;
4435
4436 pipe->blit(pipe, blit);
4437 tc_drop_resource_reference(blit->dst.resource);
4438 tc_drop_resource_reference(blit->src.resource);
4439 return call_size(tc_blit_call);
4440 }
4441
4442 static void
4443 tc_blit_enqueue(struct threaded_context *tc, const struct pipe_blit_info *info)
4444 {
4445 struct tc_blit_call *blit = tc_add_call(tc, TC_CALL_blit, tc_blit_call);
4446
4447 tc_set_resource_batch_usage(tc, info->dst.resource);
4448 tc_set_resource_reference(&blit->info.dst.resource, info->dst.resource);
4449 tc_set_resource_batch_usage(tc, info->src.resource);
4450 tc_set_resource_reference(&blit->info.src.resource, info->src.resource);
4451 memcpy(&blit->info, info, sizeof(*info));
4452 }
4453
4454 static void
4455 tc_blit(struct pipe_context *_pipe, const struct pipe_blit_info *info)
4456 {
4457 struct threaded_context *tc = threaded_context(_pipe);
4458
4459 /* filter out untracked non-resolves */
4460 if (!tc->options.parse_renderpass_info ||
4461 info->src.resource->nr_samples <= 1 ||
4462 info->dst.resource->nr_samples > 1) {
4463 tc_blit_enqueue(tc, info);
4464 return;
4465 }
4466
4467 if (tc->fb_resolve == info->dst.resource) {
4468 /* optimize out this blit entirely */
4469 tc->renderpass_info_recording->has_resolve = true;
4470 return;
4471 }
4472 for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
4473 if (tc->fb_resources[i] == info->src.resource) {
4474 tc->renderpass_info_recording->has_resolve = true;
4475 break;
4476 }
4477 }
4478 tc_blit_enqueue(tc, info);
4479 }
4480
4481 struct tc_generate_mipmap {
4482 struct tc_call_base base;
4483 enum pipe_format format;
4484 unsigned base_level;
4485 unsigned last_level;
4486 unsigned first_layer;
4487 unsigned last_layer;
4488 struct pipe_resource *res;
4489 };
4490
4491 static uint16_t ALWAYS_INLINE
4492 tc_call_generate_mipmap(struct pipe_context *pipe, void *call)
4493 {
4494 struct tc_generate_mipmap *p = to_call(call, tc_generate_mipmap);
4495 ASSERTED bool result = pipe->generate_mipmap(pipe, p->res, p->format,
4496 p->base_level,
4497 p->last_level,
4498 p->first_layer,
4499 p->last_layer);
4500 assert(result);
4501 tc_drop_resource_reference(p->res);
4502 return call_size(tc_generate_mipmap);
4503 }
4504
4505 static bool
4506 tc_generate_mipmap(struct pipe_context *_pipe,
4507 struct pipe_resource *res,
4508 enum pipe_format format,
4509 unsigned base_level,
4510 unsigned last_level,
4511 unsigned first_layer,
4512 unsigned last_layer)
4513 {
4514 struct threaded_context *tc = threaded_context(_pipe);
4515 struct pipe_context *pipe = tc->pipe;
4516 struct pipe_screen *screen = pipe->screen;
4517 unsigned bind = PIPE_BIND_SAMPLER_VIEW;
4518
4519 if (util_format_is_depth_or_stencil(format))
4520 bind = PIPE_BIND_DEPTH_STENCIL;
4521 else
4522 bind = PIPE_BIND_RENDER_TARGET;
4523
4524 if (!screen->is_format_supported(screen, format, res->target,
4525 res->nr_samples, res->nr_storage_samples,
4526 bind))
4527 return false;
4528
4529 struct tc_generate_mipmap *p =
4530 tc_add_call(tc, TC_CALL_generate_mipmap, tc_generate_mipmap);
4531
4532 tc_set_resource_batch_usage(tc, res);
4533 tc_set_resource_reference(&p->res, res);
4534 p->format = format;
4535 p->base_level = base_level;
4536 p->last_level = last_level;
4537 p->first_layer = first_layer;
4538 p->last_layer = last_layer;
4539 return true;
4540 }
4541
4542 struct tc_resource_call {
4543 struct tc_call_base base;
4544 struct pipe_resource *resource;
4545 };
4546
4547 static uint16_t ALWAYS_INLINE
4548 tc_call_flush_resource(struct pipe_context *pipe, void *call)
4549 {
4550 struct pipe_resource *resource = to_call(call, tc_resource_call)->resource;
4551
4552 pipe->flush_resource(pipe, resource);
4553 tc_drop_resource_reference(resource);
4554 return call_size(tc_resource_call);
4555 }
4556
4557 static void
4558 tc_flush_resource(struct pipe_context *_pipe, struct pipe_resource *resource)
4559 {
4560 struct threaded_context *tc = threaded_context(_pipe);
4561 struct tc_resource_call *call = tc_add_call(tc, TC_CALL_flush_resource,
4562 tc_resource_call);
4563
4564 tc_set_resource_batch_usage(tc, resource);
4565 tc_set_resource_reference(&call->resource, resource);
4566 }
4567
4568 static uint16_t ALWAYS_INLINE
4569 tc_call_invalidate_resource(struct pipe_context *pipe, void *call)
4570 {
4571 struct pipe_resource *resource = to_call(call, tc_resource_call)->resource;
4572
4573 pipe->invalidate_resource(pipe, resource);
4574 tc_drop_resource_reference(resource);
4575 return call_size(tc_resource_call);
4576 }
4577
4578 static void
4579 tc_invalidate_resource(struct pipe_context *_pipe,
4580 struct pipe_resource *resource)
4581 {
4582 struct threaded_context *tc = threaded_context(_pipe);
4583
4584 if (resource->target == PIPE_BUFFER) {
4585 tc_invalidate_buffer(tc, threaded_resource(resource));
4586 return;
4587 }
4588
4589 struct tc_resource_call *call = tc_add_call(tc, TC_CALL_invalidate_resource,
4590 tc_resource_call);
4591 tc_set_resource_batch_usage(tc, resource);
4592 tc_set_resource_reference(&call->resource, resource);
4593
4594 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
4595 if (info) {
4596 if (tc->fb_resources[PIPE_MAX_COLOR_BUFS] == resource) {
4597 info->zsbuf_invalidate = true;
4598 } else {
4599 for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
4600 if (tc->fb_resources[i] == resource)
4601 info->cbuf_invalidate |= BITFIELD_BIT(i);
4602 }
4603 }
4604 }
4605 }
4606
4607 struct tc_clear {
4608 struct tc_call_base base;
4609 bool scissor_state_set;
4610 uint8_t stencil;
4611 uint16_t buffers;
4612 float depth;
4613 struct pipe_scissor_state scissor_state;
4614 union pipe_color_union color;
4615 };
4616
4617 static uint16_t ALWAYS_INLINE
4618 tc_call_clear(struct pipe_context *pipe, void *call)
4619 {
4620 struct tc_clear *p = to_call(call, tc_clear);
4621
4622 pipe->clear(pipe, p->buffers, p->scissor_state_set ? &p->scissor_state : NULL, &p->color, p->depth, p->stencil);
4623 return call_size(tc_clear);
4624 }
4625
4626 static void
4627 tc_clear(struct pipe_context *_pipe, unsigned buffers, const struct pipe_scissor_state *scissor_state,
4628 const union pipe_color_union *color, double depth,
4629 unsigned stencil)
4630 {
4631 struct threaded_context *tc = threaded_context(_pipe);
4632 struct tc_clear *p = tc_add_call(tc, TC_CALL_clear, tc_clear);
4633
4634 p->buffers = buffers;
4635 if (scissor_state) {
4636 p->scissor_state = *scissor_state;
4637 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
4638 /* partial clear info is useful for drivers to know whether any zs writes occur;
4639 * drivers are responsible for optimizing partial clear -> full clear
4640 */
4641 if (info && buffers & PIPE_CLEAR_DEPTHSTENCIL)
4642 info->zsbuf_clear_partial |= !info->zsbuf_clear;
4643 } else {
4644 struct tc_renderpass_info *info = tc_get_renderpass_info(tc);
4645 if (info) {
4646 /* full clears use a different load operation, but are only valid if draws haven't occurred yet */
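/* "buffers >> 2" maps the PIPE_CLEAR_COLOR0.. bits (which start at bit 2,
 * after PIPE_CLEAR_DEPTH and PIPE_CLEAR_STENCIL) onto a per-colorbuffer mask.
 */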
4647 info->cbuf_clear |= (buffers >> 2) & ~info->cbuf_load;
4648 if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
4649 if (!info->zsbuf_load && !info->zsbuf_clear_partial)
4650 info->zsbuf_clear = true;
4651 else if (!info->zsbuf_clear)
4652 /* this is a clear that occurred after a draw: flag as partial to ensure it isn't ignored */
4653 info->zsbuf_clear_partial = true;
4654 }
4655 }
4656 }
4657 p->scissor_state_set = !!scissor_state;
4658 p->color = *color;
4659 p->depth = depth;
4660 p->stencil = stencil;
4661 }
4662
4663 struct tc_clear_render_target {
4664 struct tc_call_base base;
4665 bool render_condition_enabled;
4666 unsigned dstx;
4667 unsigned dsty;
4668 unsigned width;
4669 unsigned height;
4670 union pipe_color_union color;
4671 struct pipe_surface *dst;
4672 };
4673
4674 static uint16_t ALWAYS_INLINE
4675 tc_call_clear_render_target(struct pipe_context *pipe, void *call)
4676 {
4677 struct tc_clear_render_target *p = to_call(call, tc_clear_render_target);
4678
4679 pipe->clear_render_target(pipe, p->dst, &p->color, p->dstx, p->dsty, p->width, p->height,
4680 p->render_condition_enabled);
4681 tc_drop_surface_reference(p->dst);
4682 return call_size(tc_clear_render_target);
4683 }
4684
4685 static void
4686 tc_clear_render_target(struct pipe_context *_pipe,
4687 struct pipe_surface *dst,
4688 const union pipe_color_union *color,
4689 unsigned dstx, unsigned dsty,
4690 unsigned width, unsigned height,
4691 bool render_condition_enabled)
4692 {
4693 struct threaded_context *tc = threaded_context(_pipe);
4694 struct tc_clear_render_target *p = tc_add_call(tc, TC_CALL_clear_render_target, tc_clear_render_target);
4695 p->dst = NULL;
4696 pipe_surface_reference(&p->dst, dst);
4697 p->color = *color;
4698 p->dstx = dstx;
4699 p->dsty = dsty;
4700 p->width = width;
4701 p->height = height;
4702 p->render_condition_enabled = render_condition_enabled;
4703 }
4704
4705
4706 struct tc_clear_depth_stencil {
4707 struct tc_call_base base;
4708 bool render_condition_enabled;
4709 float depth;
4710 unsigned clear_flags;
4711 unsigned stencil;
4712 unsigned dstx;
4713 unsigned dsty;
4714 unsigned width;
4715 unsigned height;
4716 struct pipe_surface *dst;
4717 };
4718
4719
4720 static uint16_t ALWAYS_INLINE
4721 tc_call_clear_depth_stencil(struct pipe_context *pipe, void *call)
4722 {
4723 struct tc_clear_depth_stencil *p = to_call(call, tc_clear_depth_stencil);
4724
4725 pipe->clear_depth_stencil(pipe, p->dst, p->clear_flags, p->depth, p->stencil,
4726 p->dstx, p->dsty, p->width, p->height,
4727 p->render_condition_enabled);
4728 tc_drop_surface_reference(p->dst);
4729 return call_size(tc_clear_depth_stencil);
4730 }
4731
4732 static void
4733 tc_clear_depth_stencil(struct pipe_context *_pipe,
4734 struct pipe_surface *dst, unsigned clear_flags,
4735 double depth, unsigned stencil, unsigned dstx,
4736 unsigned dsty, unsigned width, unsigned height,
4737 bool render_condition_enabled)
4738 {
4739 struct threaded_context *tc = threaded_context(_pipe);
4740 struct tc_clear_depth_stencil *p = tc_add_call(tc, TC_CALL_clear_depth_stencil, tc_clear_depth_stencil);
4741 p->dst = NULL;
4742 pipe_surface_reference(&p->dst, dst);
4743 p->clear_flags = clear_flags;
4744 p->depth = depth;
4745 p->stencil = stencil;
4746 p->dstx = dstx;
4747 p->dsty = dsty;
4748 p->width = width;
4749 p->height = height;
4750 p->render_condition_enabled = render_condition_enabled;
4751 }
4752
4753 struct tc_clear_buffer {
4754 struct tc_call_base base;
4755 uint8_t clear_value_size;
4756 unsigned offset;
4757 unsigned size;
4758 char clear_value[16];
4759 struct pipe_resource *res;
4760 };
4761
4762 static uint16_t ALWAYS_INLINE
4763 tc_call_clear_buffer(struct pipe_context *pipe, void *call)
4764 {
4765 struct tc_clear_buffer *p = to_call(call, tc_clear_buffer);
4766
4767 pipe->clear_buffer(pipe, p->res, p->offset, p->size, p->clear_value,
4768 p->clear_value_size);
4769 tc_drop_resource_reference(p->res);
4770 return call_size(tc_clear_buffer);
4771 }
4772
4773 static void
4774 tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res,
4775 unsigned offset, unsigned size,
4776 const void *clear_value, int clear_value_size)
4777 {
4778 struct threaded_context *tc = threaded_context(_pipe);
4779 struct threaded_resource *tres = threaded_resource(res);
4780 struct tc_clear_buffer *p =
4781 tc_add_call(tc, TC_CALL_clear_buffer, tc_clear_buffer);
4782
4783 tc_buffer_disable_cpu_storage(res);
4784
4785 tc_set_resource_reference(&p->res, res);
4786 tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], res);
4787 p->offset = offset;
4788 p->size = size;
4789 memcpy(p->clear_value, clear_value, clear_value_size);
4790 p->clear_value_size = clear_value_size;
4791
4792 util_range_add(&tres->b, &tres->valid_buffer_range, offset, offset + size);
4793 }
4794
4795 struct tc_clear_texture {
4796 struct tc_call_base base;
4797 unsigned level;
4798 struct pipe_box box;
4799 char data[16];
4800 struct pipe_resource *res;
4801 };
4802
4803 static uint16_t ALWAYS_INLINE
4804 tc_call_clear_texture(struct pipe_context *pipe, void *call)
4805 {
4806 struct tc_clear_texture *p = to_call(call, tc_clear_texture);
4807
4808 pipe->clear_texture(pipe, p->res, p->level, &p->box, p->data);
4809 tc_drop_resource_reference(p->res);
4810 return call_size(tc_clear_texture);
4811 }
4812
4813 static void
4814 tc_clear_texture(struct pipe_context *_pipe, struct pipe_resource *res,
4815 unsigned level, const struct pipe_box *box, const void *data)
4816 {
4817 struct threaded_context *tc = threaded_context(_pipe);
4818 struct tc_clear_texture *p =
4819 tc_add_call(tc, TC_CALL_clear_texture, tc_clear_texture);
4820
4821 tc_set_resource_batch_usage(tc, res);
4822 tc_set_resource_reference(&p->res, res);
4823 p->level = level;
4824 p->box = *box;
4825 memcpy(p->data, data,
4826 util_format_get_blocksize(res->format));
4827 }
4828
4829 struct tc_resource_commit {
4830 struct tc_call_base base;
4831 bool commit;
4832 unsigned level;
4833 struct pipe_box box;
4834 struct pipe_resource *res;
4835 };
4836
4837 static uint16_t ALWAYS_INLINE
4838 tc_call_resource_commit(struct pipe_context *pipe, void *call)
4839 {
4840 struct tc_resource_commit *p = to_call(call, tc_resource_commit);
4841
4842 pipe->resource_commit(pipe, p->res, p->level, &p->box, p->commit);
4843 tc_drop_resource_reference(p->res);
4844 return call_size(tc_resource_commit);
4845 }
4846
4847 static bool
4848 tc_resource_commit(struct pipe_context *_pipe, struct pipe_resource *res,
4849 unsigned level, struct pipe_box *box, bool commit)
4850 {
4851 struct threaded_context *tc = threaded_context(_pipe);
4852 struct tc_resource_commit *p =
4853 tc_add_call(tc, TC_CALL_resource_commit, tc_resource_commit);
4854
4855 tc_set_resource_reference(&p->res, res);
4856 tc_set_resource_batch_usage(tc, res);
4857 p->level = level;
4858 p->box = *box;
4859 p->commit = commit;
4860 return true; /* we don't care about the return value for this call */
4861 }
4862
4863 static unsigned
4864 tc_init_intel_perf_query_info(struct pipe_context *_pipe)
4865 {
4866 struct threaded_context *tc = threaded_context(_pipe);
4867 struct pipe_context *pipe = tc->pipe;
4868
4869 return pipe->init_intel_perf_query_info(pipe);
4870 }
4871
4872 static void
4873 tc_get_intel_perf_query_info(struct pipe_context *_pipe,
4874 unsigned query_index,
4875 const char **name,
4876 uint32_t *data_size,
4877 uint32_t *n_counters,
4878 uint32_t *n_active)
4879 {
4880 struct threaded_context *tc = threaded_context(_pipe);
4881 struct pipe_context *pipe = tc->pipe;
4882
4883 tc_sync(tc); /* n_active vs begin/end_intel_perf_query */
4884 pipe->get_intel_perf_query_info(pipe, query_index, name, data_size,
4885 n_counters, n_active);
4886 }
4887
4888 static void
4889 tc_get_intel_perf_query_counter_info(struct pipe_context *_pipe,
4890 unsigned query_index,
4891 unsigned counter_index,
4892 const char **name,
4893 const char **desc,
4894 uint32_t *offset,
4895 uint32_t *data_size,
4896 uint32_t *type_enum,
4897 uint32_t *data_type_enum,
4898 uint64_t *raw_max)
4899 {
4900 struct threaded_context *tc = threaded_context(_pipe);
4901 struct pipe_context *pipe = tc->pipe;
4902
4903 pipe->get_intel_perf_query_counter_info(pipe, query_index, counter_index,
4904 name, desc, offset, data_size, type_enum, data_type_enum, raw_max);
4905 }
4906
4907 static struct pipe_query *
4908 tc_new_intel_perf_query_obj(struct pipe_context *_pipe, unsigned query_index)
4909 {
4910 struct threaded_context *tc = threaded_context(_pipe);
4911 struct pipe_context *pipe = tc->pipe;
4912
4913 return pipe->new_intel_perf_query_obj(pipe, query_index);
4914 }
4915
4916 static uint16_t ALWAYS_INLINE
4917 tc_call_begin_intel_perf_query(struct pipe_context *pipe, void *call)
4918 {
4919 (void)pipe->begin_intel_perf_query(pipe, to_call(call, tc_query_call)->query);
4920 return call_size(tc_query_call);
4921 }
4922
4923 static bool
4924 tc_begin_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
4925 {
4926 struct threaded_context *tc = threaded_context(_pipe);
4927
4928 tc_add_call(tc, TC_CALL_begin_intel_perf_query, tc_query_call)->query = q;
4929
4930 /* assume success, begin failure can be signaled from get_intel_perf_query_data */
4931 return true;
4932 }
4933
4934 static uint16_t ALWAYS_INLINE
4935 tc_call_end_intel_perf_query(struct pipe_context *pipe, void *call)
4936 {
4937 pipe->end_intel_perf_query(pipe, to_call(call, tc_query_call)->query);
4938 return call_size(tc_query_call);
4939 }
4940
4941 static void
4942 tc_end_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
4943 {
4944 struct threaded_context *tc = threaded_context(_pipe);
4945
4946 tc_add_call(tc, TC_CALL_end_intel_perf_query, tc_query_call)->query = q;
4947 }
4948
4949 static void
4950 tc_delete_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
4951 {
4952 struct threaded_context *tc = threaded_context(_pipe);
4953 struct pipe_context *pipe = tc->pipe;
4954
4955 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
4956 pipe->delete_intel_perf_query(pipe, q);
4957 }
4958
4959 static void
4960 tc_wait_intel_perf_query(struct pipe_context *_pipe, struct pipe_query *q)
4961 {
4962 struct threaded_context *tc = threaded_context(_pipe);
4963 struct pipe_context *pipe = tc->pipe;
4964
4965 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
4966 pipe->wait_intel_perf_query(pipe, q);
4967 }
4968
4969 static bool
4970 tc_is_intel_perf_query_ready(struct pipe_context *_pipe, struct pipe_query *q)
4971 {
4972 struct threaded_context *tc = threaded_context(_pipe);
4973 struct pipe_context *pipe = tc->pipe;
4974
4975 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
4976 return pipe->is_intel_perf_query_ready(pipe, q);
4977 }
4978
4979 static bool
4980 tc_get_intel_perf_query_data(struct pipe_context *_pipe,
4981 struct pipe_query *q,
4982 size_t data_size,
4983 uint32_t *data,
4984 uint32_t *bytes_written)
4985 {
4986 struct threaded_context *tc = threaded_context(_pipe);
4987 struct pipe_context *pipe = tc->pipe;
4988
4989 tc_sync(tc); /* flush potentially pending begin/end_intel_perf_queries */
4990 return pipe->get_intel_perf_query_data(pipe, q, data_size, data, bytes_written);
4991 }
4992
4993 /********************************************************************
4994 * callback
4995 */
4996
4997 struct tc_callback_call {
4998 struct tc_call_base base;
4999 void (*fn)(void *data);
5000 void *data;
5001 };
5002
5003 static uint16_t ALWAYS_INLINE
5004 tc_call_callback(UNUSED struct pipe_context *pipe, void *call)
5005 {
5006 struct tc_callback_call *p = to_call(call, tc_callback_call);
5007
5008 p->fn(p->data);
5009 return call_size(tc_callback_call);
5010 }
5011
5012 static void
5013 tc_callback(struct pipe_context *_pipe, void (*fn)(void *), void *data,
5014 bool asap)
5015 {
5016 struct threaded_context *tc = threaded_context(_pipe);
5017
5018 if (asap && tc_is_sync(tc)) {
5019 fn(data);
5020 return;
5021 }
5022
5023 struct tc_callback_call *p =
5024 tc_add_call(tc, TC_CALL_callback, tc_callback_call);
5025 p->fn = fn;
5026 p->data = data;
5027 }
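/* Illustrative usage from a driver or frontend; the names below are
 * assumptions for the example, not part of this file:
 *
 *    static void free_job_data(void *data)
 *    {
 *       FREE(data);
 *    }
 *
 *    pipe->callback(pipe, free_job_data, job_data, true);
 *
 * With asap=true and a synced queue the callback runs immediately on the
 * calling thread; otherwise it is recorded and executed later in the driver
 * thread in batch order.
 */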
5028
5029 /********************************************************************
5030 * batch execution in the driver thread
5031 */
5032
5033 typedef uint16_t (*tc_execute)(struct pipe_context *pipe, void *call);
5034
5035 ALWAYS_INLINE static void
5036 batch_execute(struct tc_batch *batch, struct pipe_context *pipe, bool parsing)
5037 {
5038 /* if the framebuffer state is persisting from a previous batch,
5039 * begin incrementing renderpass info on the first set_framebuffer_state call
5040 */
5041 bool first = !batch->first_set_fb;
5042 uint64_t *iter = batch->slots;
5043
5044 while (1) {
5045 struct tc_call_base *call = (struct tc_call_base *)iter;
5046
5047 tc_assert(call->sentinel == TC_SENTINEL);
5048 #if TC_DEBUG >= 3
5049 tc_printf("CALL: %s", tc_call_names[call->call_id]);
5050 #endif
5051 TC_TRACE_SCOPE(call->call_id);
5052
5053 /* This executes the call using a switch. */
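/* Each CALL(name) from u_threaded_context_calls.h expands to e.g.
 *    case TC_CALL_blit: iter += tc_call_blit(pipe, call); break;
 */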
5054 switch (call->call_id) {
5055 #define CALL(name) \
5056 case TC_CALL_##name: \
5057 iter += tc_call_##name(pipe, call); \
5058 break;
5059 #include "u_threaded_context_calls.h"
5060 #undef CALL
5061 case TC_END_BATCH:
5062 return;
5063 }
5064
5065 if (parsing) {
5066 if (call->call_id == TC_CALL_flush) {
5067 /* always increment renderpass info for non-deferred flushes */
5068 batch->tc->renderpass_info = incr_rp_info(batch->tc->renderpass_info);
5069 /* if a flush happens, renderpass info is always incremented after */
5070 first = false;
5071 } else if (call->call_id == TC_CALL_set_framebuffer_state) {
5072 /* the renderpass info pointer is already set at the start of the batch,
5073 * so don't increment on the first set_framebuffer_state call
5074 */
5075 if (!first)
5076 batch->tc->renderpass_info = incr_rp_info(batch->tc->renderpass_info);
5077 first = false;
5078 } else if (call->call_id == TC_CALL_draw_single ||
5079 call->call_id == TC_CALL_draw_multi ||
5080 (call->call_id >= TC_CALL_draw_single_drawid &&
5081 call->call_id <= TC_CALL_draw_vstate_multi)) {
5082 /* if a draw happens before a set_framebuffer_state on this batch,
5083 * begin incrementing renderpass data
5084 */
5085 first = false;
5086 }
5087 }
5088 }
5089 }
5090
5091 static void
5092 tc_batch_execute(void *job, UNUSED void *gdata, int thread_index)
5093 {
5094 struct tc_batch *batch = job;
5095 struct pipe_context *pipe = batch->tc->pipe;
5096
5097 tc_batch_check(batch);
5098 tc_set_driver_thread(batch->tc);
5099
5100 assert(!batch->token);
5101
5102 /* setup renderpass info */
5103 batch->tc->renderpass_info = batch->renderpass_infos.data;
5104
5105 if (batch->tc->options.parse_renderpass_info) {
5106 batch_execute(batch, pipe, true);
5107
5108 struct tc_batch_rp_info *info = batch->renderpass_infos.data;
5109 for (unsigned i = 0; i < batch->max_renderpass_info_idx + 1; i++) {
5110 if (info[i].next)
5111 info[i].next->prev = NULL;
5112 info[i].next = NULL;
5113 }
5114 } else {
5115 batch_execute(batch, pipe, false);
5116 }
5117
5118 /* Add the fence to the list of fences for the driver to signal at the next
5119 * flush, which we use for tracking which buffers are referenced by
5120 * an unflushed command buffer.
5121 */
5122 struct threaded_context *tc = batch->tc;
5123 struct util_queue_fence *fence =
5124 &tc->buffer_lists[batch->buffer_list_index].driver_flushed_fence;
5125
5126 if (tc->options.driver_calls_flush_notify) {
5127 tc->signal_fences_next_flush[tc->num_signal_fences_next_flush++] = fence;
5128
5129 /* Since our buffer lists are chained as a ring, we need to flush
5130 * the context twice as we go around the ring to make the driver signal
5131 * the buffer list fences, so that the producer thread can reuse the buffer
5132 * list structures for the next batches without waiting.
5133 */
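/* For example, if TC_MAX_BUFFER_LISTS were 4, half_ring would be 2 and
 * batches using buffer lists 1 and 3 would trigger the async flush below.
 */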
5134 unsigned half_ring = TC_MAX_BUFFER_LISTS / 2;
5135 if (batch->buffer_list_index % half_ring == half_ring - 1)
5136 pipe->flush(pipe, NULL, PIPE_FLUSH_ASYNC);
5137 } else {
5138 util_queue_fence_signal(fence);
5139 }
5140
5141 tc_clear_driver_thread(batch->tc);
5142 tc_batch_check(batch);
5143 batch->num_total_slots = 0;
5144 batch->last_mergeable_call = NULL;
5145 batch->first_set_fb = false;
5146 batch->max_renderpass_info_idx = 0;
5147 batch->tc->last_completed = batch->batch_idx;
5148 }
5149
5150 /********************************************************************
5151 * create & destroy
5152 */
5153
5154 static void
5155 tc_destroy(struct pipe_context *_pipe)
5156 {
5157 struct threaded_context *tc = threaded_context(_pipe);
5158 struct pipe_context *pipe = tc->pipe;
5159
5160 if (tc->base.const_uploader &&
5161 tc->base.stream_uploader != tc->base.const_uploader)
5162 u_upload_destroy(tc->base.const_uploader);
5163
5164 if (tc->base.stream_uploader)
5165 u_upload_destroy(tc->base.stream_uploader);
5166
5167 tc_sync(tc);
5168
5169 if (util_queue_is_initialized(&tc->queue)) {
5170 util_queue_destroy(&tc->queue);
5171
5172 for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
5173 util_queue_fence_destroy(&tc->batch_slots[i].fence);
5174 util_dynarray_fini(&tc->batch_slots[i].renderpass_infos);
5175 assert(!tc->batch_slots[i].token);
5176 }
5177 }
5178
5179 slab_destroy_child(&tc->pool_transfers);
5180 assert(tc->batch_slots[tc->next].num_total_slots == 0);
5181 pipe->destroy(pipe);
5182
5183 for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++) {
5184 if (!util_queue_fence_is_signalled(&tc->buffer_lists[i].driver_flushed_fence))
5185 util_queue_fence_signal(&tc->buffer_lists[i].driver_flushed_fence);
5186 util_queue_fence_destroy(&tc->buffer_lists[i].driver_flushed_fence);
5187 }
5188
5189 for (unsigned i = 0; i < ARRAY_SIZE(tc->fb_resources); i++)
5190 pipe_resource_reference(&tc->fb_resources[i], NULL);
5191 pipe_resource_reference(&tc->fb_resolve, NULL);
5192
5193 FREE(tc);
5194 }
5195
5196 void tc_driver_internal_flush_notify(struct threaded_context *tc)
5197 {
5198 /* Allow drivers to call this function even for internal contexts that
5199 * don't have tc. It simplifies drivers.
5200 */
5201 if (!tc)
5202 return;
5203
5204 /* Signal fences set by tc_batch_execute. */
5205 for (unsigned i = 0; i < tc->num_signal_fences_next_flush; i++)
5206 util_queue_fence_signal(tc->signal_fences_next_flush[i]);
5207
5208 tc->num_signal_fences_next_flush = 0;
5209 }
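/* Illustrative driver-side flush hook; the mydrv_* names are assumptions for
 * the example:
 *
 *    static void mydrv_flush_internal(struct mydrv_context *ctx)
 *    {
 *       ...submit the command buffer...
 *       tc_driver_internal_flush_notify(ctx->tc);
 *    }
 *
 * Drivers that set options.driver_calls_flush_notify are expected to call
 * this from their flush path so the buffer-list fences queued by
 * tc_batch_execute get signaled.
 */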
5210
5211 /**
5212 * Wrap an existing pipe_context into a threaded_context.
5213 *
5214 * \param pipe pipe_context to wrap
5215 * \param parent_transfer_pool parent slab pool set up for creating pipe_-
5216 * transfer objects; the driver should have one
5217 * in pipe_screen.
5218 * \param replace_buffer callback for replacing a pipe_resource's storage
5219 * with another pipe_resource's storage.
5220 * \param options optional TC options/callbacks
5221 * \param out if successful, the threaded_context will be returned here in
5222 * addition to the return value if "out" != NULL
5223 */
5224 struct pipe_context *
5225 threaded_context_create(struct pipe_context *pipe,
5226 struct slab_parent_pool *parent_transfer_pool,
5227 tc_replace_buffer_storage_func replace_buffer,
5228 const struct threaded_context_options *options,
5229 struct threaded_context **out)
5230 {
5231 struct threaded_context *tc;
5232
5233 if (!pipe)
5234 return NULL;
5235
5236 if (!debug_get_bool_option("GALLIUM_THREAD", true))
5237 return pipe;
5238
5239 tc = CALLOC_STRUCT(threaded_context);
5240 if (!tc) {
5241 pipe->destroy(pipe);
5242 return NULL;
5243 }
5244
5245 if (options) {
5246 /* parse_renderpass_info cannot be combined with driver_calls_flush_notify */
5247 assert(!(options->parse_renderpass_info && options->driver_calls_flush_notify));
5248 tc->options = *options;
5249 }
5250
5251 pipe = trace_context_create_threaded(pipe->screen, pipe, &replace_buffer, &tc->options);
5252
5253 /* The driver context isn't wrapped, so set its "priv" to NULL. */
5254 pipe->priv = NULL;
5255
5256 tc->pipe = pipe;
5257 tc->replace_buffer_storage = replace_buffer;
5258 tc->map_buffer_alignment =
5259 pipe->screen->caps.min_map_buffer_alignment;
5260 tc->ubo_alignment =
5261 MAX2(pipe->screen->caps.constant_buffer_offset_alignment, 64);
5262 tc->base.priv = pipe; /* priv points to the wrapped driver context */
5263 tc->base.screen = pipe->screen;
5264 tc->base.destroy = tc_destroy;
5265 tc->base.callback = tc_callback;
5266
5267 tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
5268 if (pipe->stream_uploader == pipe->const_uploader)
5269 tc->base.const_uploader = tc->base.stream_uploader;
5270 else
5271 tc->base.const_uploader = u_upload_clone(&tc->base, pipe->const_uploader);
5272
5273 if (!tc->base.stream_uploader || !tc->base.const_uploader)
5274 goto fail;
5275
5276 tc->use_forced_staging_uploads = true;
5277
5278 /* The queue size is the number of batches "waiting". Batches are removed
5279 * from the queue before being executed, so keep one tc_batch slot for that
5280 * execution. Also, keep one unused slot for an unflushed batch.
5281 */
5282 if (!util_queue_init(&tc->queue, "gdrv", TC_MAX_BATCHES - 2, 1, 0, NULL))
5283 goto fail;
5284
5285 tc->last_completed = -1;
5286 for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
5287 #if !defined(NDEBUG) && TC_DEBUG >= 1
5288 tc->batch_slots[i].sentinel = TC_SENTINEL;
5289 #endif
5290 tc->batch_slots[i].tc = tc;
5291 tc->batch_slots[i].batch_idx = i;
5292 util_queue_fence_init(&tc->batch_slots[i].fence);
5293 tc->batch_slots[i].renderpass_info_idx = -1;
5294 if (tc->options.parse_renderpass_info) {
5295 util_dynarray_init(&tc->batch_slots[i].renderpass_infos, NULL);
5296 tc_batch_renderpass_infos_resize(tc, &tc->batch_slots[i]);
5297 }
5298 }
5299 for (unsigned i = 0; i < TC_MAX_BUFFER_LISTS; i++)
5300 util_queue_fence_init(&tc->buffer_lists[i].driver_flushed_fence);
5301
5302 list_inithead(&tc->unflushed_queries);
5303
5304 slab_create_child(&tc->pool_transfers, parent_transfer_pool);
5305
5306 /* If you have different limits in each shader stage, set the maximum. */
5307 struct pipe_screen *screen = pipe->screen;
5308 tc->max_const_buffers =
5309 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5310 PIPE_SHADER_CAP_MAX_CONST_BUFFERS);
5311 tc->max_shader_buffers =
5312 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5313 PIPE_SHADER_CAP_MAX_SHADER_BUFFERS);
5314 tc->max_images =
5315 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5316 PIPE_SHADER_CAP_MAX_SHADER_IMAGES);
5317 tc->max_samplers =
5318 screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
5319 PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS);
5320
5321 tc->base.set_context_param = tc_set_context_param; /* always set this */
5322
5323 #define CTX_INIT(_member) \
5324 tc->base._member = tc->pipe->_member ? tc_##_member : NULL
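/* e.g. CTX_INIT(blit); expands to:
 *    tc->base.blit = tc->pipe->blit ? tc_blit : NULL;
 */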
5325
5326 CTX_INIT(flush);
5327 CTX_INIT(draw_vbo);
5328 CTX_INIT(draw_vertex_state);
5329 CTX_INIT(launch_grid);
5330 CTX_INIT(resource_copy_region);
5331 CTX_INIT(blit);
5332 CTX_INIT(clear);
5333 CTX_INIT(clear_render_target);
5334 CTX_INIT(clear_depth_stencil);
5335 CTX_INIT(clear_buffer);
5336 CTX_INIT(clear_texture);
5337 CTX_INIT(flush_resource);
5338 CTX_INIT(generate_mipmap);
5339 CTX_INIT(render_condition);
5340 CTX_INIT(create_query);
5341 CTX_INIT(create_batch_query);
5342 CTX_INIT(destroy_query);
5343 CTX_INIT(begin_query);
5344 CTX_INIT(end_query);
5345 CTX_INIT(get_query_result);
5346 CTX_INIT(get_query_result_resource);
5347 CTX_INIT(set_active_query_state);
5348 CTX_INIT(create_blend_state);
5349 CTX_INIT(bind_blend_state);
5350 CTX_INIT(delete_blend_state);
5351 CTX_INIT(create_sampler_state);
5352 CTX_INIT(bind_sampler_states);
5353 CTX_INIT(delete_sampler_state);
5354 CTX_INIT(create_rasterizer_state);
5355 CTX_INIT(bind_rasterizer_state);
5356 CTX_INIT(delete_rasterizer_state);
5357 CTX_INIT(create_depth_stencil_alpha_state);
5358 CTX_INIT(bind_depth_stencil_alpha_state);
5359 CTX_INIT(delete_depth_stencil_alpha_state);
5360 CTX_INIT(link_shader);
5361 CTX_INIT(create_fs_state);
5362 CTX_INIT(bind_fs_state);
5363 CTX_INIT(delete_fs_state);
5364 CTX_INIT(create_vs_state);
5365 CTX_INIT(bind_vs_state);
5366 CTX_INIT(delete_vs_state);
5367 CTX_INIT(create_gs_state);
5368 CTX_INIT(bind_gs_state);
5369 CTX_INIT(delete_gs_state);
5370 CTX_INIT(create_tcs_state);
5371 CTX_INIT(bind_tcs_state);
5372 CTX_INIT(delete_tcs_state);
5373 CTX_INIT(create_tes_state);
5374 CTX_INIT(bind_tes_state);
5375 CTX_INIT(delete_tes_state);
5376 CTX_INIT(create_compute_state);
5377 CTX_INIT(bind_compute_state);
5378 CTX_INIT(delete_compute_state);
5379 CTX_INIT(create_vertex_elements_state);
5380 CTX_INIT(bind_vertex_elements_state);
5381 CTX_INIT(delete_vertex_elements_state);
5382 CTX_INIT(set_blend_color);
5383 CTX_INIT(set_stencil_ref);
5384 CTX_INIT(set_sample_mask);
5385 CTX_INIT(set_min_samples);
5386 CTX_INIT(set_clip_state);
5387 CTX_INIT(set_constant_buffer);
5388 CTX_INIT(set_inlinable_constants);
5389 CTX_INIT(set_framebuffer_state);
5390 CTX_INIT(set_polygon_stipple);
5391 CTX_INIT(set_sample_locations);
5392 CTX_INIT(set_scissor_states);
5393 CTX_INIT(set_viewport_states);
5394 CTX_INIT(set_window_rectangles);
5395 CTX_INIT(set_sampler_views);
5396 CTX_INIT(set_tess_state);
5397 CTX_INIT(set_patch_vertices);
5398 CTX_INIT(set_shader_buffers);
5399 CTX_INIT(set_shader_images);
5400 CTX_INIT(set_vertex_buffers);
5401 CTX_INIT(create_stream_output_target);
5402 CTX_INIT(stream_output_target_destroy);
5403 CTX_INIT(set_stream_output_targets);
5404 CTX_INIT(create_sampler_view);
5405 CTX_INIT(sampler_view_destroy);
5406 CTX_INIT(create_surface);
5407 CTX_INIT(surface_destroy);
5408 CTX_INIT(buffer_map);
5409 CTX_INIT(texture_map);
5410 CTX_INIT(transfer_flush_region);
5411 CTX_INIT(buffer_unmap);
5412 CTX_INIT(texture_unmap);
5413 CTX_INIT(buffer_subdata);
5414 CTX_INIT(texture_subdata);
5415 CTX_INIT(texture_barrier);
5416 CTX_INIT(memory_barrier);
5417 CTX_INIT(resource_commit);
5418 CTX_INIT(create_video_codec);
5419 CTX_INIT(create_video_buffer);
5420 CTX_INIT(set_compute_resources);
5421 CTX_INIT(set_global_binding);
5422 CTX_INIT(get_sample_position);
5423 CTX_INIT(invalidate_resource);
5424 CTX_INIT(get_device_reset_status);
5425 CTX_INIT(set_device_reset_callback);
5426 CTX_INIT(dump_debug_state);
5427 CTX_INIT(set_log_context);
5428 CTX_INIT(emit_string_marker);
5429 CTX_INIT(set_debug_callback);
5430 CTX_INIT(create_fence_fd);
5431 CTX_INIT(fence_server_sync);
5432 CTX_INIT(fence_server_signal);
5433 CTX_INIT(get_timestamp);
5434 CTX_INIT(create_texture_handle);
5435 CTX_INIT(delete_texture_handle);
5436 CTX_INIT(make_texture_handle_resident);
5437 CTX_INIT(create_image_handle);
5438 CTX_INIT(delete_image_handle);
5439 CTX_INIT(make_image_handle_resident);
5440 CTX_INIT(set_frontend_noop);
5441 CTX_INIT(init_intel_perf_query_info);
5442 CTX_INIT(get_intel_perf_query_info);
5443 CTX_INIT(get_intel_perf_query_counter_info);
5444 CTX_INIT(new_intel_perf_query_obj);
5445 CTX_INIT(begin_intel_perf_query);
5446 CTX_INIT(end_intel_perf_query);
5447 CTX_INIT(delete_intel_perf_query);
5448 CTX_INIT(wait_intel_perf_query);
5449 CTX_INIT(is_intel_perf_query_ready);
5450 CTX_INIT(get_intel_perf_query_data);
5451 #undef CTX_INIT
5452
5453 if (out)
5454 *out = tc;
5455
5456 tc_begin_next_buffer_list(tc);
5457 if (tc->options.parse_renderpass_info)
5458 tc_batch_increment_renderpass_info(tc, tc->next, false);
5459 return &tc->base;
5460
5461 fail:
5462 tc_destroy(&tc->base);
5463 return NULL;
5464 }
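/* Illustrative driver-side usage; all mydrv_* names are assumptions for the
 * example:
 *
 *    struct pipe_context *
 *    mydrv_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
 *    {
 *       struct pipe_context *pipe = mydrv_create_context_unwrapped(screen, priv, flags);
 *       if (!pipe)
 *          return NULL;
 *       return threaded_context_create(pipe,
 *                                      &mydrv_screen(screen)->transfer_pool,
 *                                      mydrv_replace_buffer_storage,
 *                                      NULL,
 *                                      &mydrv_context(pipe)->tc);
 *    }
 *
 * The returned context may be the unwrapped pipe (GALLIUM_THREAD=0) or NULL
 * (allocation failure, in which case pipe was already destroyed), so callers
 * must expose the return value rather than the original pipe.
 */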
5465
5466 void
5467 threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned divisor)
5468 {
5469 uint64_t total_ram;
5470 if (os_get_total_physical_memory(&total_ram)) {
5471 tc->bytes_mapped_limit = total_ram / divisor;
5472 if (sizeof(void*) == 4)
5473 tc->bytes_mapped_limit = MIN2(tc->bytes_mapped_limit, 512*1024*1024UL);
5474 }
5475 }
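/* e.g. a divisor of 4 sets tc->bytes_mapped_limit to total_ram / 4, further
 * capped at 512 MB on 32-bit builds.
 */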
5476
5477 const struct tc_renderpass_info *
5478 threaded_context_get_renderpass_info(struct threaded_context *tc)
5479 {
5480 assert(tc->renderpass_info && tc->options.parse_renderpass_info);
5481 struct tc_batch_rp_info *info = tc_batch_rp_info(tc->renderpass_info);
5482 while (1) {
5483 util_queue_fence_wait(&info->ready);
5484 if (!info->next)
5485 return &info->info;
5486 info = info->next;
5487 }
5488 }
5489