• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2024 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #pragma once
7 
8 #include <ctype.h>
9 #include "nir.h"
10 #include "nir_builder.h"
11 #include "nir_serialize.h"
12 
13 /*
14  * This file contains helpers for precompiling OpenCL kernels with a Mesa driver
15  * and dispatching them from within the driver. It is a grab bag of utility
16  * functions, rather than an all-in-one solution, to give drivers flexibility to
17  * customize the compile pipeline. See asahi_clc for how the pieces fit
18  * together, and see libagx for real world examples of this infrastructure.
19  *
20  * Why OpenCL C?
21  *
22  * 1. Mesa drivers are generally written in C. OpenCL C is close enough to C11
23  *    that we can share driver code between host and device. This is the "killer
24  *    feature" and enables implementing device-generated commands in a sane way.
25  *    Both generated (e.g. GenXML) headers and entire complex driver logic may
26  *    be shared for a major maintenance win.
27  *
28  * 2. OpenCL C has significant better ergonomics than GLSL, particularly around
29  *    raw pointers. Plainly, GLSL was never designed as a systems language. What
30  *    we need for implementing driver features on-device is a systems language,
31  *    not a shading language.
32  *
33  * 3. OpenCL is the compute standard, and it is supported in Mesa via rusticl.
34  *    Using OpenCL in our drivers is a way of "eating our own dog food". If Mesa
35  *    based OpenCL isn't good enough for us, it's not good enough for our users
36  *    either.
37  *
38  * 4. OpenCL C has enough affordances for GPUs that it is suitable for GPU use,
39  *    unlike pure C11.
40  *
41  * Why precompile?
42  *
43  * 1. Precompiling lets us do build-time reflection on internal shaders to
44  *    generate data layouts and dispatch macros automatically. The precompile
45  *    pipeline implemented in this file offers significantly better ergonomics
46  *    than handrolling kernels at runtime.
47  *
48  * 2. Compiling internal shaders at draw-time can introduce jank. Compiling
49  *    internal shaders with application shaders slows down application shader
50  *    compile time (and might still introduce jank in a hash-and-cache scheme).
51  *    Compiling shaders at device creation time slows down initialization. The
52  *    only time we can compile with no performance impact is when building the
53  *    driver ahead-of-time.
54  *
55  * 3. Mesa is built (on developer and packager machines) far less often than it
56  *    is run (on user machines). Compiling at build-time is simply more
57  *    efficient in a global sense.
58  *
59  * 4. Compiling /all/ internal shaders with the Mesa build can turn runtime
60  *    assertion fails into build failures, allowing for backend compilers to be
61  *    smoke-tested without hardware testing and hence allowing regressions to be
62  *    caught sooner.
63  *
64  * At a high level, a library of kernels is compiled to SPIR-V. That SPIR-V is
65  * then translated to NIR and optimized, leaving many entrypoints. Each NIR
66  * entrypoint represents one `kernel` to be precompiled.
67  *
68  * Kernels generally have arguments. Arguments may be either scalars or
69  * pointers. It is not necessary to explicitly define a data layout for the
70  * arguments. You simply declare arguments to the OpenCL side kernel:
71  *
72  *    KERNEL(1) void foo(int x, int y) { .. }
73  *
74  * The data layout is automatically derived from the function signature
75  * (nir_precomp_derive_layout). The data layout is exposed to the CPU as
76  * structures (nir_precomp_print_layout_struct).
77  *
78  *    struct foo_args {
79  *       uint32_t x;
80  *       uint32_t y;
81  *    } PACKED;
82  *
83  * The data is expected to be mapped to something like Vulkan push constants in
84  * the hardware. The driver defines a callback to load an argument given a byte
85  * offset (e.g. via load_push_constant intrinsics). When building a variant,
86  * nir_precomp_build_variant will load the arguments according to the chosen
87  * layout:
88  *
89  *    %0 = load_push_constant 0
90  *    %1 = load_push_constant 4
91  *    ...
92  *
93  * This ensures that data layouts match between CPU and GPU, without any
94  * boilerplate, while giving drivers control over exactly how arguments are
95  * passed. (This can save an indirection compared to stuffing in a UBO.)
96  *
97  * To dispatch kernels from the driver, the kernel is "called" like a function:
98  *
99  *    foo(cmdbuf, grid(4, 4, 1), x, y);
100  *
101  * This resolves to generated dispatch macros
102  * (nir_precomp_print_dispatch_macros), which lay out their arguments according
103  * to the derived layout and then call the driver-specific dispatch. To
104  * implement that mechanism, a driver must implement the following function
105  * signature:
106  *
107  *    MESA_DISPATCH_PRECOMP(context, grid, barrier, kernel index,
108  *                          argument pointer, size of arguments)
109  *
110  * The exact types used are determined by the driver. context is something like
111  * a Vulkan command buffer. grid represents the 3D dispatch size. barrier
112  * describes the synchronization and cache flushing required before and after
113  * the dispatch. kernel index is the index of the precompiled kernel
114  * (nir_precomp_index). argument pointer is a host pointer to the sized argument
115  * structure, which the driver must upload and bind (e.g. as push constants).
116  *
117  * Because the types are ambiguous here, the same mechanism works for both
118  * Gallium and Vulkan drivers.
119  *
120  * Although the generated header could be consumed by OpenCL code,
121  * MESA_DISPATCH_PRECOMP is not intended to be implemented on the device side.
122  * Instead, an analogous mechanism can be implemented for device-side enqueue
123  * with automatic data layout handling. Device-side enqueue of precompiled
124  * kernels has various applications, most obviously for implementing
125  * device-generated commands.
126  *
127  * All precompiled kernels for a given target are zero-indexed and referenced in
128  * an array of binaries. These indices are enum values, generated by
129  * nir_precomp_print_program_enum. The array of kernels is generated by
130  * nir_precomp_print_binary_map. There is generally an array for each hardware
131  * target supported by a driver. On device creation, the driver would select the
132  * array of binaries for the probed hardware.
133  *
134  * Sometimes a single binary can be used for multiple targets. In this case, the
135  * driver should compile it only once and remap the binary arrays with the
136  * callback passed to nir_precomp_print_binary_map.
137  *
138  * A single entrypoint may have multiple variants, as a small shader key. To
139  * support this, kernel parameters suffixed with __n will automatically vary
140  * from 0 to n - 1. This mechanism is controlled by
141  * nir_precomp_parse_variant_param. For example:
142  *
143  *    KERNEL(1) void bar(uchar *x, int variant__4) {
144  *       for (uint i = 0; i <= variant__4; ++i)
145  *          x[i]++;
146  *    }
147  *
148  * will generate 4 binaries with 1, 2, 3, and 4 additions respectively. This
149  * mechanism (sigil suffixing) is kinda ugly, but I can't figure out a nicer way
150  * to attach metadata to the argument in standard OpenCL.
151  *
152  * Internally, all variants of a given kernel have a flat index. The bijection
153  * between n variant parameters and 1 flat index is given in the
154  * nir_precomp_decode_variant_index comment.
155  *
156  * Kernels must declare their workgroup size with
157  * __attribute__((reqd_work_group_size(...))) for two reasons. First, variable
158  * workgroup sizes have tricky register allocation problems in several backends,
159  * avoided here. Second, it makes more sense to attach the workgroup size to the
160  * kernel than to the caller so this improves ergonomics of the dispatch macros.
161  */
162 
163 #define NIR_PRECOMP_MAX_ARGS (64)
164 
165 struct nir_precomp_opts {
166    /* If nonzero, minimum (power-of-two) alignment required for kernel
167     * arguments. Kernel arguments will be naturally aligned regardless, but this
168     * models a minimum alignment required by some hardware.
169     */
170    unsigned arg_align_B;
171 };
172 
173 struct nir_precomp_layout {
174    unsigned size_B;
175    unsigned offset_B[NIR_PRECOMP_MAX_ARGS];
176    bool prepadded[NIR_PRECOMP_MAX_ARGS];
177 };
178 
179 static inline unsigned
nir_precomp_parse_variant_param(const nir_function * f,unsigned p)180 nir_precomp_parse_variant_param(const nir_function *f, unsigned p)
181 {
182    assert(p < f->num_params);
183 
184    const char *token = "__";
185    const char *q = strstr(f->params[p].name, token);
186    if (q == NULL)
187       return 0;
188 
189    int n = atoi(q + strlen(token));
190 
191    /* Ensure the number is something reasonable */
192    assert(n > 1 && n < 32 && "sanity check");
193    return n;
194 }
195 
196 static inline bool
nir_precomp_is_variant_param(const nir_function * f,unsigned p)197 nir_precomp_is_variant_param(const nir_function *f, unsigned p)
198 {
199    return nir_precomp_parse_variant_param(f, p) != 0;
200 }
201 
202 #define nir_precomp_foreach_arg(f, p)           \
203    for (unsigned p = 0; p < f->num_params; ++p) \
204       if (!nir_precomp_is_variant_param(f, p))
205 
206 #define nir_precomp_foreach_variant_param(f, p) \
207    for (unsigned p = 0; p < f->num_params; ++p) \
208       if (nir_precomp_is_variant_param(f, p))
209 
210 static inline unsigned
nir_precomp_nr_variants(const nir_function * f)211 nir_precomp_nr_variants(const nir_function *f)
212 {
213    unsigned nr = 1;
214 
215    nir_precomp_foreach_variant_param(f, p) {
216       nr *= nir_precomp_parse_variant_param(f, p);
217    }
218 
219    return nr;
220 }
221 
222 static inline bool
nir_precomp_has_variants(const nir_function * f)223 nir_precomp_has_variants(const nir_function *f)
224 {
225    return nir_precomp_nr_variants(f) > 1;
226 }
227 
228 static inline struct nir_precomp_layout
nir_precomp_derive_layout(const struct nir_precomp_opts * opt,const nir_function * f)229 nir_precomp_derive_layout(const struct nir_precomp_opts *opt,
230                           const nir_function *f)
231 {
232    struct nir_precomp_layout l = { 0 };
233 
234    nir_precomp_foreach_arg(f, a) {
235       nir_parameter param = f->params[a];
236       assert(a < ARRAY_SIZE(l.offset_B));
237 
238       /* Align members naturally */
239       l.offset_B[a] = ALIGN_POT(l.size_B, param.bit_size / 8);
240 
241       /* Align arguments to driver minimum */
242       if (opt->arg_align_B) {
243          l.offset_B[a] = ALIGN_POT(l.offset_B[a], opt->arg_align_B);
244       }
245 
246       l.prepadded[a] = (l.offset_B[a] != l.size_B);
247       l.size_B = l.offset_B[a] + (param.num_components * param.bit_size) / 8;
248    }
249 
250    return l;
251 }
252 
253 static inline unsigned
nir_precomp_index(const nir_shader * lib,const nir_function * func)254 nir_precomp_index(const nir_shader *lib, const nir_function *func)
255 {
256    unsigned i = 0;
257 
258    nir_foreach_entrypoint(candidate, lib) {
259       if (candidate == func)
260          return i;
261 
262       i += nir_precomp_nr_variants(candidate);
263    }
264 
265    unreachable("function must be in library");
266 }
267 
268 static inline void
nir_print_uppercase(FILE * fp,const char * str)269 nir_print_uppercase(FILE *fp, const char *str)
270 {
271    for (unsigned i = 0; i < strlen(str); ++i) {
272       fputc(toupper(str[i]), fp);
273    }
274 }
275 
276 static inline void
nir_precomp_print_enum_value(FILE * fp,const nir_function * func)277 nir_precomp_print_enum_value(FILE *fp, const nir_function *func)
278 {
279    nir_print_uppercase(fp, func->name);
280 }
281 
282 static inline void
nir_precomp_print_enum_variant_value(FILE * fp,const nir_function * func,unsigned v)283 nir_precomp_print_enum_variant_value(FILE *fp, const nir_function *func, unsigned v)
284 {
285    nir_precomp_print_enum_value(fp, func);
286 
287    if (nir_precomp_has_variants(func)) {
288       fprintf(fp, "_%u", v);
289    } else {
290       assert(v == 0);
291    }
292 }
293 
294 static inline void
nir_precomp_print_variant_params(FILE * fp,nir_function * func,bool with_types)295 nir_precomp_print_variant_params(FILE *fp, nir_function *func, bool with_types)
296 {
297    if (nir_precomp_has_variants(func)) {
298       fprintf(fp, "(");
299 
300       bool first = true;
301       nir_precomp_foreach_variant_param(func, p) {
302          fprintf(fp, "%s%s%s", first ? "" : ", ", with_types ? "unsigned " : "",
303                  func->params[p].name);
304          first = false;
305       }
306 
307       fprintf(fp, ")");
308    }
309 }
310 
311 /*
312  * Given a flattened 1D index, extract the i'th coordinate of the original N-D
313  * vector. The forward map is:
314  *
315  *    I = sum(t=1...n) [x_t product(j=1...(t-1)) [k_j]]
316  *
317  * It can be shown that
318  *
319  *    I < product_(j=1...n)[k_j]
320  *
321  *    x_i = floor(I / product(j=1...(i-1)) [k_j]) mod k_i
322  *
323  * The inequality is by induction on n. The equivalence follows from the
324  * inequality by splitting the sum of I at t=i, showing the smaller terms get
325  * killed by the floor and the higher terms get killed by the modulus leaving
326  * just x_i.
327  *
328  * The forward map is emitted in nir_precomp_print_program_enum. The inverse is
329  * calculated here.
330  */
331 static inline unsigned
nir_precomp_decode_variant_index(const nir_function * func,unsigned I,unsigned i)332 nir_precomp_decode_variant_index(const nir_function *func, unsigned I,
333                                  unsigned i)
334 {
335    unsigned product = 1;
336 
337    nir_precomp_foreach_variant_param(func, j) {
338       if (j >= i)
339          break;
340 
341       unsigned k_j = nir_precomp_parse_variant_param(func, j);
342       product *= k_j;
343    }
344 
345    unsigned k_i = nir_precomp_parse_variant_param(func, i);
346    return (I / product) % k_i;
347 }
348 
349 static inline void
nir_precomp_print_program_enum(FILE * fp,const nir_shader * lib,const char * prefix)350 nir_precomp_print_program_enum(FILE *fp, const nir_shader *lib, const char *prefix)
351 {
352    /* Generate an enum indexing all binaries */
353    fprintf(fp, "enum %s_program {\n", prefix);
354    nir_foreach_entrypoint(func, lib) {
355       unsigned index = nir_precomp_index(lib, func);
356 
357       for (unsigned v = 0; v < nir_precomp_nr_variants(func); ++v) {
358          fprintf(fp, "    ");
359          nir_precomp_print_enum_variant_value(fp, func, v);
360          fprintf(fp, " = %u,\n", index + v);
361       }
362    }
363    fprintf(fp, "    ");
364    nir_print_uppercase(fp, prefix);
365    fprintf(fp, "_NUM_PROGRAMS,\n");
366    fprintf(fp, "};\n\n");
367 
368    /* Generate indexing variants */
369    nir_foreach_entrypoint(func, lib) {
370       if (nir_precomp_has_variants(func)) {
371          fprintf(fp, "static inline unsigned\n");
372          nir_precomp_print_enum_value(fp, func);
373          nir_precomp_print_variant_params(fp, func, true);
374          fprintf(fp, "\n");
375          fprintf(fp, "{\n");
376 
377          nir_precomp_foreach_variant_param(func, p) {
378             /* Assert indices are in bounds. These provides some safety. */
379             fprintf(fp, "   assert(%s < %u);\n", func->params[p].name,
380                     nir_precomp_parse_variant_param(func, p));
381          }
382 
383          /* Flatten an N-D index into a 1D index using the standard mapping.
384           *
385           * We iterate parameters backwards so we can do a single multiply-add
386           * each step for simplicity (similar to Horner's method).
387           */
388          fprintf(fp, "\n");
389          bool first = true;
390          for (signed p = func->num_params - 1; p >= 0; --p) {
391             if (!nir_precomp_is_variant_param(func, p))
392                continue;
393 
394             if (first) {
395                fprintf(fp, "   unsigned idx = %s;\n", func->params[p].name);
396             } else {
397                fprintf(fp, "   idx = (idx * %u) + %s;\n",
398                        nir_precomp_parse_variant_param(func, p),
399                        func->params[p].name);
400             }
401 
402             first = false;
403          }
404 
405          /* Post-condition: flattened index is in bounds. */
406          fprintf(fp, "\n");
407          fprintf(fp, "   assert(idx < %u);\n", nir_precomp_nr_variants(func));
408 
409          fprintf(fp, "   return ");
410          nir_precomp_print_enum_variant_value(fp, func, 0);
411          fprintf(fp, " + idx;\n");
412          fprintf(fp, "}\n\n");
413       }
414    }
415    fprintf(fp, "\n");
416 }
417 
418 static inline void
nir_precomp_print_layout_struct(FILE * fp,const struct nir_precomp_opts * opt,const nir_function * func)419 nir_precomp_print_layout_struct(FILE *fp, const struct nir_precomp_opts *opt,
420                                 const nir_function *func)
421 {
422    struct nir_precomp_layout layout = nir_precomp_derive_layout(opt, func);
423 
424    /* Generate a C struct matching the data layout we chose. This is how
425     * the CPU will pack arguments.
426     */
427    unsigned offset_B = 0;
428 
429    fprintf(fp, "struct %s_args {\n", func->name);
430    nir_precomp_foreach_arg(func, a) {
431       nir_parameter param = func->params[a];
432       assert(param.name != NULL && "kernel args must be named");
433 
434       assert(layout.offset_B[a] >= offset_B);
435       unsigned pad = layout.offset_B[a] - offset_B;
436       assert((pad > 0) == layout.prepadded[a]);
437 
438       if (pad > 0) {
439          fprintf(fp, "   uint8_t _pad%u[%u];\n", a, pad);
440          offset_B += pad;
441       }
442 
443       /* After padding, the layout will match. */
444       assert(layout.offset_B[a] == offset_B);
445 
446       fprintf(fp, "   uint%u_t %s", param.bit_size, param.name);
447       if (param.num_components > 1) {
448          fprintf(fp, "[%u]", param.num_components);
449       }
450       fprintf(fp, ";\n");
451 
452       offset_B += param.num_components * (param.bit_size / 8);
453    }
454    fprintf(fp, "} PACKED;\n\n");
455 
456    /* Assert that the layout on the CPU matches the layout on the GPU. Because
457     * of the asserts above, these are mostly just sanity checking the compiler.
458     * But better err on the side of defensive because alignment bugs are REALLY
459     * painful to track down and we don't pay by the static assert.
460     */
461    nir_precomp_foreach_arg(func, a) {
462       nir_parameter param = func->params[a];
463 
464       fprintf(fp, "static_assert(offsetof(struct %s_args, %s) == %u, \"\");\n",
465               func->name, param.name, layout.offset_B[a]);
466    }
467    fprintf(fp, "static_assert(sizeof(struct %s_args) == %u, \"\");\n",
468            func->name, layout.size_B);
469 
470    fprintf(fp, "\n");
471 }
472 
473 static inline void
nir_precomp_print_dispatch_macros(FILE * fp,const struct nir_precomp_opts * opt,const nir_shader * nir)474 nir_precomp_print_dispatch_macros(FILE *fp, const struct nir_precomp_opts *opt,
475                                   const nir_shader *nir)
476 {
477    nir_foreach_entrypoint(func, nir) {
478       struct nir_precomp_layout layout = nir_precomp_derive_layout(opt, func);
479 
480       for (unsigned i = 0; i < 2; ++i) {
481          bool is_struct = i == 0;
482 
483          fprintf(fp, "#define %s%s(_context, _grid, _barrier%s", func->name,
484                  is_struct ? "_struct" : "", is_struct ? ", _data" : "");
485 
486          /* Add the arguments, including variant parameters. For struct macros,
487           * we include only the variant parameters; the kernel arguments are
488           * taken from the struct.
489           */
490          for (unsigned p = 0; p < func->num_params; ++p) {
491             if (!is_struct || nir_precomp_is_variant_param(func, p))
492                fprintf(fp, ", %s", func->params[p].name);
493          }
494 
495          fprintf(fp, ") do { \\\n");
496 
497          fprintf(fp, "   struct %s_args _args = ", func->name);
498 
499          if (is_struct) {
500             fprintf(fp, "_data");
501          } else {
502             fprintf(fp, "{");
503 
504             nir_precomp_foreach_arg(func, a) {
505                /* We need to zero out the padding between members. We cannot use
506                 * a designated initializer without prefixing the macro
507                 * arguments, which would add noise to the macro signature
508                 * reported in IDEs (which should ideally match the actual
509                 * signature as close as possible).
510                 */
511                if (layout.prepadded[a]) {
512                   assert(a > 0 && "first argument is never prepadded");
513                   fprintf(fp, ", {0}");
514                }
515 
516                fprintf(fp, "%s%s", a == 0 ? "" : ", ", func->params[a].name);
517             }
518 
519             fprintf(fp, "}");
520          }
521 
522          fprintf(fp, ";\\\n");
523 
524          /* Dispatch via MESA_DISPATCH_PRECOMP, which the driver must #define
525           * suitably before #include-ing this file.
526           */
527          fprintf(fp, "   MESA_DISPATCH_PRECOMP(_context, _grid, _barrier, ");
528          nir_precomp_print_enum_value(fp, func);
529          nir_precomp_print_variant_params(fp, func, false);
530          fprintf(fp, ", &_args, sizeof(_args)); \\\n");
531          fprintf(fp, "} while(0);\n\n");
532       }
533    }
534    fprintf(fp, "\n");
535 }
536 
537 static inline void
nir_precomp_print_extern_binary_map(FILE * fp,const char * prefix,const char * target)538 nir_precomp_print_extern_binary_map(FILE *fp,
539                                     const char *prefix, const char *target)
540 {
541    fprintf(fp, "extern const uint32_t *%s_%s[", prefix, target);
542    nir_print_uppercase(fp, prefix);
543    fprintf(fp, "_NUM_PROGRAMS];\n");
544 }
545 
546 static inline void
nir_precomp_print_binary_map(FILE * fp,const nir_shader * nir,const char * prefix,const char * target,const char * (* map)(nir_function * func,unsigned variant,const char * target))547 nir_precomp_print_binary_map(FILE *fp, const nir_shader *nir,
548                              const char *prefix, const char *target,
549                              const char *(*map)(nir_function *func,
550                                                 unsigned variant,
551                                                 const char *target))
552 {
553    fprintf(fp, "const uint32_t *%s_%s[", prefix, target);
554    nir_print_uppercase(fp, prefix);
555    fprintf(fp, "_NUM_PROGRAMS] = {\n");
556 
557    nir_foreach_entrypoint(func, nir) {
558       for (unsigned v = 0; v < nir_precomp_nr_variants(func); ++v) {
559          fprintf(fp, "    [");
560          nir_precomp_print_enum_variant_value(fp, func, v);
561          fprintf(fp, "] = %s_%u_%s,\n", func->name, v,
562                  map ? map(func, v, target) : target);
563       }
564    }
565 
566    fprintf(fp, "};\n\n");
567 }
568 
569 static inline nir_shader *
nir_precompiled_build_variant(const nir_function * libfunc,unsigned variant,const nir_shader_compiler_options * opts,const struct nir_precomp_opts * precomp_opt,nir_def * (* load_arg)(nir_builder * b,unsigned num_components,unsigned bit_size,unsigned offset_B))570 nir_precompiled_build_variant(const nir_function *libfunc, unsigned variant,
571                               const nir_shader_compiler_options *opts,
572                               const struct nir_precomp_opts *precomp_opt,
573                               nir_def *(*load_arg)(nir_builder *b,
574                                                    unsigned num_components,
575                                                    unsigned bit_size,
576                                                    unsigned offset_B))
577 {
578    bool has_variants = nir_precomp_has_variants(libfunc);
579    struct nir_precomp_layout layout =
580       nir_precomp_derive_layout(precomp_opt, libfunc);
581 
582    nir_builder b;
583    if (has_variants) {
584       b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, opts,
585                                          "%s variant %u", libfunc->name,
586                                          variant);
587    } else {
588       b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, opts, "%s",
589                                          libfunc->name);
590    }
591 
592    assert(libfunc->workgroup_size[0] != 0 && "must set workgroup size");
593 
594    b.shader->info.workgroup_size[0] = libfunc->workgroup_size[0];
595    b.shader->info.workgroup_size[1] = libfunc->workgroup_size[1];
596    b.shader->info.workgroup_size[2] = libfunc->workgroup_size[2];
597 
598    nir_function *func = nir_function_clone(b.shader, libfunc);
599    func->is_entrypoint = false;
600 
601    nir_def *args[NIR_PRECOMP_MAX_ARGS] = { NULL };
602 
603    /* Some parameters are variant indices and others are kernel arguments */
604    for (unsigned a = 0; a < libfunc->num_params; ++a) {
605       nir_parameter p = func->params[a];
606 
607       if (nir_precomp_is_variant_param(libfunc, a)) {
608          unsigned idx = nir_precomp_decode_variant_index(libfunc, variant, a);
609          args[a] = nir_imm_intN_t(&b, idx, p.bit_size);
610       } else {
611          args[a] = load_arg(&b, p.num_components, p.bit_size, layout.offset_B[a]);
612       }
613    }
614 
615    nir_build_call(&b, func, func->num_params, args);
616    return b.shader;
617 }
618 
619 static inline void
nir_precomp_print_blob(FILE * fp,const char * arr_name,const char * suffix,uint32_t variant,const uint32_t * data,size_t len,bool is_static)620 nir_precomp_print_blob(FILE *fp, const char *arr_name, const char *suffix,
621                        uint32_t variant, const uint32_t *data, size_t len, bool is_static)
622 {
623    fprintf(fp, "%sconst uint32_t %s_%u_%s[%zu] = {", is_static ? "static " : "", arr_name, variant, suffix,
624            DIV_ROUND_UP(len, 4));
625    for (unsigned i = 0; i < (len / 4); i++) {
626       if (i % 4 == 0)
627          fprintf(fp, "\n   ");
628 
629       fprintf(fp, " 0x%08" PRIx32 ",", data[i]);
630    }
631 
632    if (len % 4) {
633       const uint8_t *data_u8 = (const uint8_t *)data;
634       uint32_t last = 0;
635       unsigned last_offs = ROUND_DOWN_TO(len, 4);
636       for (unsigned i = 0; i < len % 4; ++i) {
637          last |= (uint32_t)data_u8[last_offs + i] << (i * 8);
638       }
639 
640       fprintf(fp, " 0x%08" PRIx32 ",", last);
641    }
642 
643    fprintf(fp, "\n};\n");
644 }
645 
646 static inline void
nir_precomp_print_nir(FILE * fp_c,FILE * fp_h,const nir_shader * nir,const char * name,const char * suffix)647 nir_precomp_print_nir(FILE *fp_c, FILE *fp_h, const nir_shader *nir,
648                       const char *name, const char *suffix)
649 {
650    struct blob blob;
651    blob_init(&blob);
652    nir_serialize(&blob, nir, true /* strip */);
653 
654    nir_precomp_print_blob(fp_c, name, suffix, 0, (const uint32_t *)blob.data,
655                           blob.size, false);
656 
657    fprintf(fp_h, "extern const uint32_t %s_0_%s[%zu];\n", name, suffix,
658            DIV_ROUND_UP(blob.size, 4));
659 
660    blob_finish(&blob);
661 }
662 
663 static inline void
nir_precomp_print_header(FILE * fp_c,FILE * fp_h,const char * copyright,const char * h_name)664 nir_precomp_print_header(FILE *fp_c, FILE *fp_h, const char *copyright,
665                          const char *h_name)
666 {
667    for (unsigned i = 0; i < 2; ++i) {
668       FILE *fp = i ? fp_c : fp_h;
669       fprintf(fp, "/*\n");
670       fprintf(fp, " * Copyright %s\n", copyright);
671       fprintf(fp, " * SPDX-License-Identifier: MIT\n");
672       fprintf(fp, " *\n");
673       fprintf(fp, " * Autogenerated file, do not edit\n");
674       fprintf(fp, " */\n\n");
675 
676       /* uint32_t types are used throughout */
677       fprintf(fp, "#include <stdint.h>\n\n");
678    }
679 
680    /* The generated C code depends on the header we will generate */
681    fprintf(fp_c, "#include \"%s\"\n", h_name);
682 
683    /* Include guard the header. This relies on a grown up compiler. If you're
684     * doing precompiled, you have one.
685     */
686    fprintf(fp_h, "#pragma once\n");
687 
688    /* The generated header uses unprefixed static_assert which needs an #include
689     * seemingly.
690     */
691    fprintf(fp_h, "#include \"util/macros.h\"\n\n");
692 }
693