• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include "pipe/p_screen.h"
28 #include "pipe/p_state.h"
29 #include "tgsi/tgsi_dump.h"
30 #include "util/format/u_format.h"
31 #include "util/u_inlines.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 
35 #include "nir/tgsi_to_nir.h"
36 #include "nir_serialize.h"
37 
38 #include "freedreno_context.h"
39 #include "freedreno_util.h"
40 
41 #include "ir3/ir3_cache.h"
42 #include "ir3/ir3_compiler.h"
43 #include "ir3/ir3_descriptor.h"
44 #include "ir3/ir3_gallium.h"
45 #include "ir3/ir3_nir.h"
46 #include "ir3/ir3_shader.h"
47 
/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   /* The underlying shader; owned by this cso and destroyed with it. */
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};
60 
61 /**
62  * Should initial variants be compiled synchronously?
63  *
64  * The only case where util_debug_message() is used in the initial-variants
65  * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
66  * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
67  * compile the initial shader variant asynchronously.
68  */
69 static bool
initial_variants_synchronous(struct fd_context * ctx)70 initial_variants_synchronous(struct fd_context *ctx)
71 {
72    return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
73           FD_DBG(SERIALC);
74 }
75 
/* Report per-variant statistics (instruction counts, register pressure,
 * stalls, etc) through the debug callback.  Only active with
 * FD_MESA_DEBUG=shaderdb; shader-db's ./run scrapes these messages.
 */
static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct util_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   util_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u last-helper, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.last_helper, v->info.max_half_reg + 1, v->info.max_reg + 1,
      v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
}
102 
/* Allocate a GPU buffer for the variant's instructions and upload them.
 * Must only be called once per variant (asserts that no BO exists yet).
 */
static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct ir3_compiler *compiler = v->compiler;

   assert(!v->bo);

   /* NOMAP: the CPU never needs to read the shader back, only the GPU
    * executes from it:
    */
   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), v->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, 0, v->info.size);
}
119 
/* Look up (or compile) the variant of 'shader' matching 'key'.  If the
 * variant did not exist yet it is compiled, uploaded to a BO, and (with
 * shaderdb enabled) its stats are reported.  Recompiles that happen after
 * the initial variants were created are flagged as draw-time perf warnings.
 */
struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct util_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   MESA_TRACE_FUNC();

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         /* A recompile this late means some state change forced a variant
          * we did not predict at CSO creation time:
          */
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      /* The binning-pass companion variant (if any) needs the same
       * upload/report treatment:
       */
      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}
157 
158 static void
copy_stream_out(struct ir3_stream_output_info * i,const struct pipe_stream_output_info * p)159 copy_stream_out(struct ir3_stream_output_info *i,
160                 const struct pipe_stream_output_info *p)
161 {
162    STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
163    STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));
164 
165    i->streams_written = 0;
166    i->num_outputs = p->num_outputs;
167    for (int n = 0; n < ARRAY_SIZE(i->stride); n++) {
168       i->stride[n] = p->stride[n];
169       if (p->stride[n])
170          i->streams_written |= BIT(n);
171    }
172 
173    for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
174       i->output[n].register_index = p->output[n].register_index;
175       i->output[n].start_component = p->output[n].start_component;
176       i->output[n].num_components = p->output[n].num_components;
177       i->output[n].output_buffer = p->output[n].output_buffer;
178       i->output[n].dst_offset = p->output[n].dst_offset;
179       i->output[n].stream = p->output[n].stream;
180    }
181 }
182 
/* Compile the "standard" set of variants for a graphics shader up-front,
 * so that draw-time recompiles (and the stalls they cause) are rare.
 * Called either synchronously from CSO creation or from the compile queue.
 */
static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct util_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that we shader-db something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   /* First try the unconstrained-constlen variant; only if it exceeds the
    * safe limit do we also pre-compile a safe-constlen fallback:
    */
   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   /* From here on, any new variant is a draw-time recompile worth a
    * perf warning:
    */
   shader->initial_variants_done = true;
}
250 
251 static void
create_initial_variants_async(void * job,void * gdata,int thread_index)252 create_initial_variants_async(void *job, void *gdata, int thread_index)
253 {
254    struct ir3_shader_state *hwcso = job;
255    struct util_debug_callback debug = {};
256 
257    MESA_TRACE_FUNC();
258 
259    create_initial_variants(hwcso, &debug);
260 }
261 
/* util_queue entry point: compile a compute shader's single (key-less)
 * initial variant on a compile-queue thread.
 */
static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct util_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   MESA_TRACE_FUNC();

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}
275 
276 /* a bit annoying that compute-shader and normal shader state objects
277  * aren't a bit more aligned.
278  */
279 void *
ir3_shader_compute_state_create(struct pipe_context * pctx,const struct pipe_compute_state * cso)280 ir3_shader_compute_state_create(struct pipe_context *pctx,
281                                 const struct pipe_compute_state *cso)
282 {
283    struct fd_context *ctx = fd_context(pctx);
284 
285    /* req_input_mem will only be non-zero for cl kernels (ie. clover).
286     * This isn't a perfect test because I guess it is possible (but
287     * uncommon) for none for the kernel parameters to be a global,
288     * but ctx->set_global_bindings() can't fail, so this is the next
289     * best place to fail if we need a newer version of kernel driver:
290     */
291    if ((cso->req_input_mem > 0) &&
292        fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
293       return NULL;
294    }
295 
296    struct ir3_compiler *compiler = ctx->screen->compiler;
297    nir_shader *nir;
298 
299    if (cso->ir_type == PIPE_SHADER_IR_NIR) {
300       /* we take ownership of the reference: */
301       nir = (nir_shader *)cso->prog;
302    } else if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
303       const nir_shader_compiler_options *options =
304             ir3_get_compiler_options(compiler);
305       const struct pipe_binary_program_header *hdr = cso->prog;
306       struct blob_reader reader;
307 
308       blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
309       nir = nir_deserialize(NULL, options, &reader);
310 
311       ir3_finalize_nir(compiler, nir);
312    } else {
313       assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
314       if (ir3_shader_debug & IR3_DBG_DISASM) {
315          tgsi_dump(cso->prog, 0);
316       }
317       nir = tgsi_to_nir(cso->prog, pctx->screen, false);
318    }
319 
320    if (ctx->screen->gen >= 6)
321       ir3_nir_lower_io_to_bindless(nir);
322 
323    enum ir3_wavesize_option api_wavesize = IR3_SINGLE_OR_DOUBLE;
324    enum ir3_wavesize_option real_wavesize = IR3_SINGLE_OR_DOUBLE;
325 
326    if (ctx->screen->gen >= 6 && !ctx->screen->info->a6xx.supports_double_threadsize) {
327       api_wavesize = IR3_SINGLE_ONLY;
328       real_wavesize = IR3_SINGLE_ONLY;
329    }
330 
331    struct ir3_shader *shader =
332       ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
333                               /* TODO: force to single on a6xx with legacy
334                                * ballot extension that uses 64-bit masks
335                                */
336                               .api_wavesize = api_wavesize,
337                               .real_wavesize = real_wavesize,
338                           }, NULL);
339    shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4;     /* byte->dword */
340    shader->cs.req_local_mem = cso->static_shared_mem;
341 
342    struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
343 
344    util_queue_fence_init(&hwcso->ready);
345    hwcso->shader = shader;
346 
347    /* Immediately compile a standard variant.  We have so few variants in our
348     * shaders, that doing so almost eliminates draw-time recompiles.  (This
349     * is also how we get data from shader-db's ./run)
350     */
351 
352    if (initial_variants_synchronous(ctx)) {
353       static struct ir3_shader_key key; /* static is implicitly zeroed */
354       ir3_shader_variant(shader, key, false, &ctx->debug);
355       shader->initial_variants_done = true;
356    } else {
357       struct fd_screen *screen = ctx->screen;
358       util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
359                          create_initial_compute_variants_async, NULL, 0);
360    }
361 
362    return hwcso;
363 }
364 
365 void *
ir3_shader_state_create(struct pipe_context * pctx,const struct pipe_shader_state * cso)366 ir3_shader_state_create(struct pipe_context *pctx,
367                         const struct pipe_shader_state *cso)
368 {
369    struct fd_context *ctx = fd_context(pctx);
370    struct ir3_compiler *compiler = ctx->screen->compiler;
371    struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
372 
373    /*
374     * Convert to nir (if necessary):
375     */
376 
377    nir_shader *nir;
378    if (cso->type == PIPE_SHADER_IR_NIR) {
379       /* we take ownership of the reference: */
380       nir = cso->ir.nir;
381    } else {
382       assert(cso->type == PIPE_SHADER_IR_TGSI);
383       if (ir3_shader_debug & IR3_DBG_DISASM) {
384          tgsi_dump(cso->tokens, 0);
385       }
386       nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
387    }
388 
389    if (ctx->screen->gen >= 6)
390       ir3_nir_lower_io_to_bindless(nir);
391 
392    /*
393     * Create ir3_shader:
394     *
395     * This part is cheap, it doesn't compile initial variants
396     */
397 
398    struct ir3_stream_output_info stream_output = {};
399    copy_stream_out(&stream_output, &cso->stream_output);
400 
401    hwcso->shader =
402       ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
403                               /* TODO: force to single on a6xx with legacy
404                                * ballot extension that uses 64-bit masks
405                                */
406                               .api_wavesize = IR3_SINGLE_OR_DOUBLE,
407                               .real_wavesize = IR3_SINGLE_OR_DOUBLE,
408                           },
409                           &stream_output);
410 
411    /*
412     * Create initial variants to avoid draw-time stalls.  This is
413     * normally done asynchronously, unless debug is enabled (which
414     * will be the case for shader-db)
415     */
416 
417    util_queue_fence_init(&hwcso->ready);
418 
419    if (initial_variants_synchronous(ctx)) {
420       create_initial_variants(hwcso, &ctx->debug);
421    } else {
422       util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
423                          create_initial_variants_async, NULL, 0);
424    }
425 
426    return hwcso;
427 }
428 
/* Gallium CSO delete hook: drain any pending async compile, invalidate
 * cached program-state entries referencing this cso, free the uploaded
 * shader BOs, and destroy the ir3_shader.
 */
void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}
464 
/* Get the ir3_shader from a cso, blocking until any async compile of the
 * initial variants has finished.  NULL-safe (returns NULL for NULL cso).
 * Stalls longer than 1000us are reported via perf_time.
 */
struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   MESA_TRACE_FUNC();

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}
484 
485 struct shader_info *
ir3_get_shader_info(struct ir3_shader_state * hwcso)486 ir3_get_shader_info(struct ir3_shader_state *hwcso)
487 {
488    if (!hwcso)
489       return NULL;
490    return &hwcso->shader->nir->info;
491 }
492 
/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   /* Only mark stages dirty whose key bits actually changed: */
   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      /* Remember the key so the next call can diff against it: */
      *ctx->last.key = *key;
   }
}
519 
520 static char *
ir3_screen_finalize_nir(struct pipe_screen * pscreen,void * nir)521 ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
522 {
523    struct fd_screen *screen = fd_screen(pscreen);
524 
525    MESA_TRACE_FUNC();
526 
527    ir3_nir_lower_io_to_temporaries(nir);
528    ir3_finalize_nir(screen->compiler, nir);
529 
530    return NULL;
531 }
532 
533 static void
ir3_set_max_shader_compiler_threads(struct pipe_screen * pscreen,unsigned max_threads)534 ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
535                                     unsigned max_threads)
536 {
537    struct fd_screen *screen = fd_screen(pscreen);
538 
539    /* This function doesn't allow a greater number of threads than
540     * the queue had at its creation.
541     */
542    util_queue_adjust_num_threads(&screen->compile_queue, max_threads,
543                                  false);
544 }
545 
546 static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen * pscreen,void * shader,enum pipe_shader_type shader_type)547 ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
548                                             void *shader,
549                                             enum pipe_shader_type shader_type)
550 {
551    struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;
552 
553    return util_queue_fence_is_signalled(&hwcso->ready);
554 }
555 
556 void
ir3_prog_init(struct pipe_context * pctx)557 ir3_prog_init(struct pipe_context *pctx)
558 {
559    pctx->create_vs_state = ir3_shader_state_create;
560    pctx->delete_vs_state = ir3_shader_state_delete;
561 
562    pctx->create_tcs_state = ir3_shader_state_create;
563    pctx->delete_tcs_state = ir3_shader_state_delete;
564 
565    pctx->create_tes_state = ir3_shader_state_create;
566    pctx->delete_tes_state = ir3_shader_state_delete;
567 
568    pctx->create_gs_state = ir3_shader_state_create;
569    pctx->delete_gs_state = ir3_shader_state_delete;
570 
571    pctx->create_fs_state = ir3_shader_state_create;
572    pctx->delete_fs_state = ir3_shader_state_delete;
573 }
574 
/* Screen-level setup: create the ir3 compiler, spin up the shader compile
 * queue, and install the screen hooks for nir finalization and parallel
 * shader compilation.
 */
void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   struct ir3_compiler_options options = {
      .bindless_fb_read_descriptor =
         ir3_shader_descriptor_set(PIPE_SHADER_FRAGMENT),
      .bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET +
                               IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts,
   };

   if (screen->gen >= 6) {
      options.lower_base_vertex = true;
   }
   screen->compiler =
      ir3_compiler_create(screen->dev, screen->dev_id, screen->info, &options);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores.  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}
614 
615 void
ir3_screen_fini(struct pipe_screen * pscreen)616 ir3_screen_fini(struct pipe_screen *pscreen)
617 {
618    struct fd_screen *screen = fd_screen(pscreen);
619 
620    util_queue_destroy(&screen->compile_queue);
621    ir3_compiler_destroy(screen->compiler);
622    screen->compiler = NULL;
623 }
624 
625 void
ir3_update_max_tf_vtx(struct fd_context * ctx,const struct ir3_shader_variant * v)626 ir3_update_max_tf_vtx(struct fd_context *ctx,
627                       const struct ir3_shader_variant *v)
628 {
629    struct fd_streamout_stateobj *so = &ctx->streamout;
630    const struct ir3_stream_output_info *info = &v->stream_output;
631    uint32_t maxvtxcnt = 0x7fffffff;
632 
633    if (v->stream_output.num_outputs == 0)
634       maxvtxcnt = 0;
635    if (so->num_targets == 0)
636       maxvtxcnt = 0;
637 
638    /* offset to write to is:
639     *
640     *   total_vtxcnt = vtxcnt + offsets[i]
641     *   offset = total_vtxcnt * stride[i]
642     *
643     *   offset =   vtxcnt * stride[i]       ; calculated in shader
644     *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
645     *
646     * assuming for each vtx, each target buffer will have data written
647     * up to 'offset + stride[i]', that leaves maxvtxcnt as:
648     *
649     *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
650     *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
651     *
652     * but shader is actually doing a less-than (rather than less-than-
653     * equal) check, so we can drop the -stride[i].
654     *
655     * TODO is assumption about `offset + stride[i]` legit?
656     */
657    for (unsigned i = 0; i < so->num_targets; i++) {
658       struct pipe_stream_output_target *target = so->targets[i];
659       unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
660       if (target) {
661          uint32_t max = target->buffer_size / stride;
662          maxvtxcnt = MIN2(maxvtxcnt, max);
663       }
664    }
665 
666    ctx->streamout.max_tf_vtx = maxvtxcnt;
667 }
668 
/* Ensure the context's private-memory (scratch) BO pool is large enough
 * for the given variant.  There are two pools, indexed by pvtmem_per_wave
 * (per-wave vs per-fiber layout); each pool's BO only ever grows.
 */
void
ir3_get_private_mem(struct fd_context *ctx, const struct ir3_shader_variant *so)
{
   uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
   uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;

   uint32_t per_fiber_size = so->pvtmem_size;
   if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) {
      /* Growing: drop the old (too-small) BO first, if any: */
      if (ctx->pvtmem[so->pvtmem_per_wave].bo)
         fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);

      /* Each SP core gets its own 4KiB-aligned slice: */
      uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);
      uint32_t total_size = per_sp_size * num_sp_cores;

      ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
      ctx->pvtmem[so->pvtmem_per_wave].per_sp_size = per_sp_size;
      ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new(
         ctx->screen->dev, total_size, FD_BO_NOMAP, "pvtmem_%s_%d",
         so->pvtmem_per_wave ? "per_wave" : "per_fiber", per_fiber_size);
   }
}
690