/*
 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"
#include "nir_serialize.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_descriptor.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is also where
 * async compile is plumbed in.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where util_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct util_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   util_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u last-helper, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops, %u preamble inst, %d early-preamble\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.last_helper, v->info.max_half_reg + 1, v->info.max_reg + 1,
      v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops,
      v->info.preamble_instrs_count, v->info.early_preamble);
}

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct ir3_compiler *compiler = v->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), v->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, 0, v->info.size);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct util_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   MESA_TRACE_FUNC();

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->streams_written = 0;
   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++) {
      i->stride[n] = p->stride[n];
      if (p->stride[n])
         i->streams_written |= BIT(n);
   }

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct util_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that shader-db reports something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

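   /* If the default variant's constlen exceeds the compiler's "safe" limit,
    * also pre-compile a fallback variant with key.safe_constlen set, since a
    * draw-time key may require the more constrained version:
    */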
   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct util_debug_callback debug = {};

   MESA_TRACE_FUNC();

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct util_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   MESA_TRACE_FUNC();

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else {
      assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   enum ir3_wavesize_option api_wavesize = IR3_SINGLE_OR_DOUBLE;
   enum ir3_wavesize_option real_wavesize = IR3_SINGLE_OR_DOUBLE;

   if (ctx->screen->gen >= 6 &&
       !ctx->screen->info->a6xx.supports_double_threadsize) {
      api_wavesize = IR3_SINGLE_ONLY;
      real_wavesize = IR3_SINGLE_ONLY;
   }

   const struct ir3_shader_options ir3_options = {
      /* TODO: force to single on a6xx with legacy ballot extension that uses
       * 64-bit masks
       */
      .api_wavesize = api_wavesize,
      .real_wavesize = real_wavesize,
   };

   struct ir3_shader *shader =
      ir3_shader_from_nir(compiler, nir, &ir3_options, NULL);
   shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */
   shader->cs.req_local_mem = cso->static_shared_mem;

   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants in our
    * shaders that doing so almost eliminates draw-time recompiles.  (This
    * is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                             .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          },
                          &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   MESA_TRACE_FUNC();

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   const struct ir3_shader_nir_options options = {};

   MESA_TRACE_FUNC();

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, &options, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads,
                                 false);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   struct ir3_compiler_options options = {
      .bindless_fb_read_descriptor =
         ir3_shader_descriptor_set(PIPE_SHADER_FRAGMENT),
      .bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET +
                               IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts,
      .dual_color_blend_by_location = screen->driconf.dual_color_blend_by_location,
   };

   if (screen->gen >= 6) {
      options.lower_base_vertex = true;
   }

   if (screen->gen >= 7) {
      options.push_ubo_with_preamble = true;
   }

   screen->compiler =
      ir3_compiler_create(screen->dev, screen->dev_id, screen->info, &options);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores.  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;
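   /* e.g. (illustrative): on an 8-core system this yields 4 compiler threads. */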

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   const struct ir3_stream_output_info *info = &v->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   if (v->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
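   /* e.g. (illustrative numbers): a target with buffer_size = 4096 bytes and
    * stride[i] = 4 dwords (16 bytes) clamps maxvtxcnt to 4096/16 = 256.
    */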
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}

void
ir3_get_private_mem(struct fd_context *ctx, const struct ir3_shader_variant *so)
{
   uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
   uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;

   uint32_t per_fiber_size = so->pvtmem_size;
   if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) {
      if (ctx->pvtmem[so->pvtmem_per_wave].bo)
         fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);

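      /* Size the BO for the whole GPU: each SP gets a 4 KiB-aligned slice big
       * enough for all of its fibers.  e.g. (illustrative): 512 bytes per
       * fiber with 2048 fibers_per_sp gives a 1 MiB per-SP slice; with 4 SP
       * cores the BO is 4 MiB.
       */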
      uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);
      uint32_t total_size = per_sp_size * num_sp_cores;

      ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
      ctx->pvtmem[so->pvtmem_per_wave].per_sp_size = per_sp_size;
      ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new(
         ctx->screen->dev, total_size, FD_BO_NOMAP, "pvtmem_%s_%d",
         so->pvtmem_per_wave ? "per_wave" : "per_fiber", per_fiber_size);
   }
}