/*
 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"
#include "nir_serialize.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_descriptor.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where util_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct util_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   util_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u last-helper, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops, %u preamble inst, %d early-preamble\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.last_helper, v->info.max_half_reg + 1, v->info.max_reg + 1,
      v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops,
      v->info.preamble_instrs_count, v->info.early_preamble);
}

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct ir3_compiler *compiler = v->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), v->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, 0, v->info.size);
}

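/* Get (compiling if necessary) the variant of 'shader' matching 'key'.  When
 * a new variant is created, this also emits shader-db / perf-debug info and
 * uploads the assembled program (and its binning-pass counterpart, if any).
 */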
struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct util_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   MESA_TRACE_FUNC();

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->streams_written = 0;
   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++) {
      i->stride[n] = p->stride[n];
      if (p->stride[n])
         i->streams_written |= BIT(n);
   }

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct util_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that shader-db reports something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct util_debug_callback debug = {};

   MESA_TRACE_FUNC();

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct util_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   MESA_TRACE_FUNC();

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of the kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   enum ir3_wavesize_option api_wavesize = IR3_SINGLE_OR_DOUBLE;
   enum ir3_wavesize_option real_wavesize = IR3_SINGLE_OR_DOUBLE;

   /* Note: this must happen before ir3_options is initialized below, since
    * the initializer copies the wavesize values.
    */
   if (ctx->screen->gen >= 6 && !ctx->screen->info->a6xx.supports_double_threadsize) {
      api_wavesize = IR3_SINGLE_ONLY;
      real_wavesize = IR3_SINGLE_ONLY;
   }

   const struct ir3_shader_options ir3_options = {
      /* TODO: force to single on a6xx with legacy ballot extension that uses
       * 64-bit masks
       */
      .api_wavesize = api_wavesize,
      .real_wavesize = real_wavesize,
   };

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else {
      assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   struct ir3_shader *shader =
      ir3_shader_from_nir(compiler, nir, &ir3_options, NULL);
   shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4;     /* byte->dword */
   shader->cs.req_local_mem = cso->static_shared_mem;

   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants in our
    * shaders, that doing so almost eliminates draw-time recompiles.  (This
    * is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                              /* TODO: force to single on a6xx with legacy
                               * ballot extension that uses 64-bit masks
                               */
                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          },
                          &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

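/* Get the ir3_shader for a hwcso, waiting for any outstanding initial-variant
 * compile job to finish first.
 */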
struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   MESA_TRACE_FUNC();

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   const struct ir3_shader_nir_options options = {};

   MESA_TRACE_FUNC();

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, &options, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads,
                                 false);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   struct ir3_compiler_options options = {
      .bindless_fb_read_descriptor =
         ir3_shader_descriptor_set(PIPE_SHADER_FRAGMENT),
      .bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET +
                               IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts,
      .dual_color_blend_by_location = screen->driconf.dual_color_blend_by_location,
   };

   if (screen->gen >= 6) {
      options.lower_base_vertex = true;
   }

   if (screen->gen >= 7) {
      options.push_ubo_with_preamble = true;
   }

   screen->compiler =
      ir3_compiler_create(screen->dev, screen->dev_id, screen->info, &options);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores.  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

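/* Recompute the maximum number of vertices that can be written to the
 * currently bound stream-output targets for this shader variant.
 */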
void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   const struct ir3_stream_output_info *info = &v->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   if (v->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}

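/* Grow the per-context private memory (pvtmem) BO if this variant needs more
 * per-fiber scratch space than is currently allocated for its layout
 * (per-wave vs per-fiber).
 */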
void
ir3_get_private_mem(struct fd_context *ctx, const struct ir3_shader_variant *so)
{
   uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
   uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;

   uint32_t per_fiber_size = so->pvtmem_size;
   if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) {
      if (ctx->pvtmem[so->pvtmem_per_wave].bo)
         fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);

      uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);
      uint32_t total_size = per_sp_size * num_sp_cores;

      ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
      ctx->pvtmem[so->pvtmem_per_wave].per_sp_size = per_sp_size;
      ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new(
         ctx->screen->dev, total_size, FD_BO_NOMAP, "pvtmem_%s_%d",
         so->pvtmem_per_wave ? "per_wave" : "per_fiber", per_fiber_size);
   }
}
673