/*
 * Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
 * Copyright (C) 2019-2022 Collabora, Ltd.
 * Copyright (C) 2019 Red Hat Inc.
 * Copyright (C) 2018 Alyssa Rosenzweig
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 *
 */

#include "pan_shader.h"
#include "nir/tgsi_to_nir.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "pan_bo.h"
#include "pan_context.h"

static struct panfrost_uncompiled_shader *
panfrost_alloc_shader(const nir_shader *nir)
{
   struct panfrost_uncompiled_shader *so =
      rzalloc(NULL, struct panfrost_uncompiled_shader);

   simple_mtx_init(&so->lock, mtx_plain);
   util_dynarray_init(&so->variants, so);

   so->nir = nir;

   /* Serialize the NIR to a binary blob that we can hash for the disk
    * cache. Drop unnecessary information (like variable names) so the
    * serialized NIR is smaller, and also to let us detect more isomorphic
    * shaders when hashing, increasing cache hits.
    */
   struct blob blob;
   blob_init(&blob);
   nir_serialize(&blob, nir, true);
   _mesa_sha1_compute(blob.data, blob.size, so->nir_sha1);
   blob_finish(&blob);

   return so;
}

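/* Reserve space for a new variant in the uncompiled shader's variant array.
 * The returned entry is not zeroed; callers are responsible for initializing
 * it.
 */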
static struct panfrost_compiled_shader *
panfrost_alloc_variant(struct panfrost_uncompiled_shader *so)
{
   return util_dynarray_grow(&so->variants, struct panfrost_compiled_shader, 1);
}

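/* Fold load_poly_line_smooth_enabled intrinsics to a constant "true". This
 * pass only runs on variants keyed with line smoothing enabled, so the value
 * is known at compile time.
 */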
static void
lower_load_poly_line_smooth_enabled(nir_shader *nir,
                                    const struct panfrost_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_builder b = nir_builder_create(impl);

   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_poly_line_smooth_enabled)
            continue;

         b.cursor = nir_before_instr(instr);
         nir_def_rewrite_uses(&intrin->def, nir_imm_true(&b));

         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }
}

static void
panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
                        struct util_debug_callback *dbg,
                        struct panfrost_shader_key *key, unsigned req_local_mem,
                        unsigned fixed_varying_mask,
                        struct panfrost_shader_binary *out)
{
   struct panfrost_device *dev = pan_device(&screen->base);

   nir_shader *s = nir_shader_clone(NULL, ir);

   /* While graphics shaders are preprocessed at CSO create time, compute
    * kernels are not preprocessed until they're cloned since the driver does
    * not get ownership of the NIR from compute CSOs. Do this preprocessing now.
    * Compute CSOs call this function during create time, so preprocessing
    * happens at CSO create time regardless.
    */
   if (gl_shader_stage_is_compute(s->info.stage))
      pan_shader_preprocess(s, panfrost_device_gpu_id(dev));

   struct panfrost_compile_inputs inputs = {
      .debug = dbg,
      .gpu_id = panfrost_device_gpu_id(dev),
   };

   /* Lower this early so the backends don't have to worry about it */
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      inputs.fixed_varying_mask = key->fs.fixed_varying_mask;
   } else if (s->info.stage == MESA_SHADER_VERTEX) {
      inputs.fixed_varying_mask = fixed_varying_mask;

      /* No IDVS for internal XFB shaders */
      inputs.no_idvs = s->info.has_transform_feedback_varyings;

      if (s->info.has_transform_feedback_varyings) {
         NIR_PASS_V(s, nir_io_add_const_offset_to_base,
                    nir_var_shader_in | nir_var_shader_out);
         NIR_PASS_V(s, nir_io_add_intrinsic_xfb_info);
         NIR_PASS_V(s, pan_lower_xfb);
      }
   }

   util_dynarray_init(&out->binary, NULL);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      if (key->fs.nr_cbufs_for_fragcolor) {
         NIR_PASS_V(s, panfrost_nir_remove_fragcolor_stores,
                    key->fs.nr_cbufs_for_fragcolor);
      }

      if (key->fs.sprite_coord_enable) {
         NIR_PASS_V(s, nir_lower_texcoord_replace_late,
                    key->fs.sprite_coord_enable,
                    true /* point coord is sysval */);
      }

      if (key->fs.clip_plane_enable) {
         NIR_PASS_V(s, nir_lower_clip_fs, key->fs.clip_plane_enable, false);
      }

      if (key->fs.line_smooth) {
         NIR_PASS_V(s, nir_lower_poly_line_smooth, 16);
         NIR_PASS_V(s, lower_load_poly_line_smooth_enabled, key);
         NIR_PASS_V(s, nir_lower_alu);
      }
   }

   if (dev->arch <= 5 && s->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(s, pan_lower_framebuffer, key->fs.rt_formats,
                 pan_raw_format_mask_midgard(key->fs.rt_formats), 0,
                 panfrost_device_gpu_id(dev) < 0x700);
   }

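   /* Collect the system values this shader needs so the driver can supply
    * them at draw time.
    */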
   NIR_PASS_V(s, panfrost_nir_lower_sysvals, &out->sysvals);

   screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);

   assert(req_local_mem >= out->info.wls_size);
   out->info.wls_size = req_local_mem;

   /* In both clone and tgsi_to_nir paths, the shader is ralloc'd against
    * a NULL context
    */
   ralloc_free(s);
}

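/* Look up a shader variant in the disk cache, compiling it on a miss, then
 * upload the binary and prepare the descriptors used at draw time.
 */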
static void
panfrost_shader_get(struct pipe_screen *pscreen,
                    struct panfrost_pool *shader_pool,
                    struct panfrost_pool *desc_pool,
                    struct panfrost_uncompiled_shader *uncompiled,
                    struct util_debug_callback *dbg,
                    struct panfrost_compiled_shader *state,
                    unsigned req_local_mem)
{
   struct panfrost_screen *screen = pan_screen(pscreen);
   struct panfrost_device *dev = pan_device(pscreen);

   struct panfrost_shader_binary res = {0};

   /* Try to retrieve the variant from the disk cache. If that fails,
    * compile a new variant and store in the disk cache for later reuse.
    */
   if (!panfrost_disk_cache_retrieve(screen->disk_cache, uncompiled,
                                     &state->key, &res)) {
      panfrost_shader_compile(screen, uncompiled->nir, dbg, &state->key,
                              req_local_mem, uncompiled->fixed_varying_mask,
                              &res);

      panfrost_disk_cache_store(screen->disk_cache, uncompiled, &state->key,
                                &res);
   }

   state->info = res.info;
   state->sysvals = res.sysvals;

   if (res.binary.size) {
      state->bin = panfrost_pool_take_ref(
         shader_pool,
         pan_pool_upload_aligned(&shader_pool->base, res.binary.data,
                                 res.binary.size, 128));
   }

   util_dynarray_fini(&res.binary);

   /* Don't upload RSD for fragment shaders since they need draw-time
    * merging for e.g. depth/stencil/alpha. RSDs are replaced by simpler
    * shader program descriptors on Valhall, which can be preuploaded even
    * for fragment shaders. */
   bool upload =
      !(uncompiled->nir->info.stage == MESA_SHADER_FRAGMENT && dev->arch <= 7);
   screen->vtbl.prepare_shader(state, desc_pool, upload);

   panfrost_analyze_sysvals(state);
}

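/* Derive the shader key from the current draw state. Only fragment shaders
 * use state-dependent keys; other stages return early with a zeroed key.
 */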
static void
panfrost_build_key(struct panfrost_context *ctx,
                   struct panfrost_shader_key *key,
                   struct panfrost_uncompiled_shader *uncompiled)
{
   const nir_shader *nir = uncompiled->nir;

   /* We don't currently have vertex shader variants */
   if (nir->info.stage != MESA_SHADER_FRAGMENT)
      return;

   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
   struct pipe_rasterizer_state *rast = (void *)ctx->rasterizer;
   struct panfrost_uncompiled_shader *vs = ctx->uncompiled[MESA_SHADER_VERTEX];

   /* gl_FragColor lowering needs the number of colour buffers */
   if (uncompiled->fragcolor_lowered) {
      key->fs.nr_cbufs_for_fragcolor = fb->nr_cbufs;
   }

   /* Point sprite lowering needed on Bifrost and newer */
   if (dev->arch >= 6 && rast && ctx->active_prim == MESA_PRIM_POINTS) {
      key->fs.sprite_coord_enable = rast->sprite_coord_enable;
   }

   /* User clip plane lowering needed everywhere */
   if (rast) {
      key->fs.clip_plane_enable = rast->clip_plane_enable;

      if (u_reduced_prim(ctx->active_prim) == MESA_PRIM_LINES)
         key->fs.line_smooth = rast->line_smooth;
   }

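   /* Midgard lowers non-blendable render targets in the fragment shader, so
    * the key needs each render target's format. PIPE_FORMAT_NONE marks
    * blendable targets that need no lowering.
    */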
   if (dev->arch <= 5) {
      u_foreach_bit(i, (nir->info.outputs_read >> FRAG_RESULT_DATA0)) {
         enum pipe_format fmt = PIPE_FORMAT_R8G8B8A8_UNORM;

         if ((fb->nr_cbufs > i) && fb->cbufs[i])
            fmt = fb->cbufs[i]->format;

         if (panfrost_blendable_formats_v6[fmt].internal)
            fmt = PIPE_FORMAT_NONE;

         key->fs.rt_formats[i] = fmt;
      }
   }

   /* Funny desktop GL varying lowering on Valhall */
   if (dev->arch >= 9) {
      assert(vs != NULL && "too early");
      key->fs.fixed_varying_mask = vs->fixed_varying_mask;
   }
}

static struct panfrost_compiled_shader *
panfrost_new_variant_locked(struct panfrost_context *ctx,
                            struct panfrost_uncompiled_shader *uncompiled,
                            struct panfrost_shader_key *key)
{
   struct panfrost_compiled_shader *prog = panfrost_alloc_variant(uncompiled);

   *prog = (struct panfrost_compiled_shader){
      .key = *key,
      .stream_output = uncompiled->stream_output,
   };

   panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, uncompiled,
                       &ctx->base.debug, prog, 0);

   prog->earlyzs = pan_earlyzs_analyze(&prog->info);

   return prog;
}

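/* Bind a graphics shader CSO and mark dependent state dirty. The matching
 * variant is selected (and compiled if missing) immediately.
 */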
static void
panfrost_bind_shader_state(struct pipe_context *pctx, void *hwcso,
                           enum pipe_shader_type type)
{
   struct panfrost_context *ctx = pan_context(pctx);
   ctx->uncompiled[type] = hwcso;
   ctx->prog[type] = NULL;

   ctx->dirty |= PAN_DIRTY_TLS_SIZE;
   ctx->dirty_shader[type] |= PAN_DIRTY_STAGE_SHADER;

   if (hwcso)
      panfrost_update_shader_variant(ctx, type);
}

void
panfrost_update_shader_variant(struct panfrost_context *ctx,
                               enum pipe_shader_type type)
{
   /* No shader variants for compute */
   if (type == PIPE_SHADER_COMPUTE)
      return;

   /* We need linking information, defer this */
   if (type == PIPE_SHADER_FRAGMENT && !ctx->uncompiled[PIPE_SHADER_VERTEX])
      return;

   /* Also defer, happens with GALLIUM_HUD */
   if (!ctx->uncompiled[type])
      return;

   /* Match the appropriate variant */
   struct panfrost_uncompiled_shader *uncompiled = ctx->uncompiled[type];
   struct panfrost_compiled_shader *compiled = NULL;

   simple_mtx_lock(&uncompiled->lock);

   struct panfrost_shader_key key = {0};
   panfrost_build_key(ctx, &key, uncompiled);

   util_dynarray_foreach(&uncompiled->variants, struct panfrost_compiled_shader,
                         so) {
      if (memcmp(&key, &so->key, sizeof(key)) == 0) {
         compiled = so;
         break;
      }
   }

   if (compiled == NULL)
      compiled = panfrost_new_variant_locked(ctx, uncompiled, &key);

   ctx->prog[type] = compiled;

   simple_mtx_unlock(&uncompiled->lock);
}

static void
panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
{
   panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);

   /* Fragment shaders are linked with vertex shaders */
   struct panfrost_context *ctx = pan_context(pctx);
   panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
}

static void
panfrost_bind_fs_state(struct pipe_context *pctx, void *hwcso)
{
   panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_FRAGMENT);
}

static void *
panfrost_create_shader_state(struct pipe_context *pctx,
                             const struct pipe_shader_state *cso)
{
   nir_shader *nir = (cso->type == PIPE_SHADER_IR_TGSI)
                        ? tgsi_to_nir(cso->tokens, pctx->screen, false)
                        : cso->ir.nir;

   struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(nir);

   /* The driver gets ownership of the nir_shader for graphics. The NIR is
    * ralloc'd. Free the NIR when we free the uncompiled shader.
    */
   ralloc_steal(so, nir);

   so->stream_output = cso->stream_output;
   so->nir = nir;

   /* Fix linkage early */
   if (so->nir->info.stage == MESA_SHADER_VERTEX) {
      so->fixed_varying_mask =
         (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
         ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
   }

   /* gl_FragColor needs to be lowered before lowering I/O, do that now */
   if (nir->info.stage == MESA_SHADER_FRAGMENT &&
       nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) {

      NIR_PASS_V(nir, nir_lower_fragcolor,
                 nir->info.fs.color_is_dual_source ? 1 : 8);
      so->fragcolor_lowered = true;
   }

   /* Then run the suite of lowering and optimization, including I/O lowering */
   struct panfrost_device *dev = pan_device(pctx->screen);
   pan_shader_preprocess(nir, panfrost_device_gpu_id(dev));

   /* If this shader uses transform feedback, compile the transform
    * feedback program. This is a special shader variant.
    */
   struct panfrost_context *ctx = pan_context(pctx);

   if (so->nir->xfb_info) {
      nir_shader *xfb = nir_shader_clone(NULL, so->nir);
      xfb->info.name = ralloc_asprintf(xfb, "%s@xfb", xfb->info.name);
      xfb->info.internal = true;

      so->xfb = calloc(1, sizeof(struct panfrost_compiled_shader));
      so->xfb->key.vs_is_xfb = true;

      panfrost_shader_get(ctx->base.screen, &ctx->shaders, &ctx->descs, so,
                          &ctx->base.debug, so->xfb, 0);

      /* Since transform feedback is handled via the transform
       * feedback program, the original program no longer uses XFB
       */
      nir->info.has_transform_feedback_varyings = false;
   }

   /* Compile the program. We don't use vertex shader keys, so there will
    * be no further vertex shader variants. We do have fragment shader
    * keys, but we can still compile with a default key that will work most
    * of the time.
    */
   struct panfrost_shader_key key = {0};

   /* gl_FragColor lowering needs the number of colour buffers on desktop
    * GL, where it acts as an implicit broadcast to all colour buffers.
    *
    * However, gl_FragColor is a legacy feature, so assume that if
    * gl_FragColor is used, there is only a single render target. The
    * implicit broadcast is neither especially useful nor required by GLES.
    */
   if (so->fragcolor_lowered)
      key.fs.nr_cbufs_for_fragcolor = 1;

   /* Creating a CSO is single-threaded, so it's ok to use the
    * locked function without explicitly taking the lock. Creating a
    * default variant acts as a precompile.
    */
   panfrost_new_variant_locked(ctx, so, &key);

   return so;
}

static void
panfrost_delete_shader_state(struct pipe_context *pctx, void *so)
{
   struct panfrost_uncompiled_shader *cso =
      (struct panfrost_uncompiled_shader *)so;

   util_dynarray_foreach(&cso->variants, struct panfrost_compiled_shader, so) {
      panfrost_bo_unreference(so->bin.bo);
      panfrost_bo_unreference(so->state.bo);
      panfrost_bo_unreference(so->linkage.bo);
   }

   if (cso->xfb) {
      panfrost_bo_unreference(cso->xfb->bin.bo);
      panfrost_bo_unreference(cso->xfb->state.bo);
      panfrost_bo_unreference(cso->xfb->linkage.bo);
      free(cso->xfb);
   }

   simple_mtx_destroy(&cso->lock);

   ralloc_free(so);
}

/*
 * Create a compute CSO. As compute kernels do not require variants, they are
 * precompiled, creating both the uncompiled and compiled shaders now.
 */
static void *
panfrost_create_compute_state(struct pipe_context *pctx,
                              const struct pipe_compute_state *cso)
{
   struct panfrost_context *ctx = pan_context(pctx);
   struct panfrost_uncompiled_shader *so = panfrost_alloc_shader(cso->prog);
   struct panfrost_compiled_shader *v = panfrost_alloc_variant(so);
   memset(v, 0, sizeof *v);

   assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");

   panfrost_shader_get(pctx->screen, &ctx->shaders, &ctx->descs, so,
                       &ctx->base.debug, v, cso->static_shared_mem);

   /* The NIR becomes invalid after this. For compute kernels, we never
    * need to access it again. Don't keep a dangling pointer around.
    */
   ralloc_free((void *)so->nir);
   so->nir = NULL;

   return so;
}

static void
panfrost_bind_compute_state(struct pipe_context *pipe, void *cso)
{
   struct panfrost_context *ctx = pan_context(pipe);
   struct panfrost_uncompiled_shader *uncompiled = cso;

   ctx->uncompiled[PIPE_SHADER_COMPUTE] = uncompiled;

   ctx->prog[PIPE_SHADER_COMPUTE] =
      uncompiled ? util_dynarray_begin(&uncompiled->variants) : NULL;
}

static void
panfrost_get_compute_state_info(struct pipe_context *pipe, void *cso,
                                struct pipe_compute_state_object_info *info)
{
   struct panfrost_device *dev = pan_device(pipe->screen);
   struct panfrost_uncompiled_shader *uncompiled = cso;
   struct panfrost_compiled_shader *cs =
      util_dynarray_begin(&uncompiled->variants);

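   /* The maximum thread count per core depends on how many work registers
    * the compiled kernel uses.
    */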
   info->max_threads =
      panfrost_max_thread_count(dev->arch, cs->info.work_reg_count);
   info->private_memory = cs->info.tls_size;
   info->simd_sizes = pan_subgroup_size(dev->arch);
   info->preferred_simd_size = info->simd_sizes;
}

void
panfrost_shader_context_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = panfrost_create_shader_state;
   pctx->delete_vs_state = panfrost_delete_shader_state;
   pctx->bind_vs_state = panfrost_bind_vs_state;

   pctx->create_fs_state = panfrost_create_shader_state;
   pctx->delete_fs_state = panfrost_delete_shader_state;
   pctx->bind_fs_state = panfrost_bind_fs_state;

   pctx->create_compute_state = panfrost_create_compute_state;
   pctx->bind_compute_state = panfrost_bind_compute_state;
   pctx->get_compute_state_info = panfrost_get_compute_state_info;
   pctx->delete_compute_state = panfrost_delete_shader_state;
}