1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27 #ifndef IR3_SHADER_H_
28 #define IR3_SHADER_H_
29
30 #include <stdio.h>
31
32 #include "c11/threads.h"
33 #include "compiler/nir/nir.h"
34 #include "compiler/shader_enums.h"
35 #include "util/bitscan.h"
36 #include "util/disk_cache.h"
37
38 #include "ir3_compiler.h"
39
40 BEGINC;
41
42 /* driver param indices: */
43 enum ir3_driver_param {
44 /* compute shader driver params: */
45 IR3_DP_NUM_WORK_GROUPS_X = 0,
46 IR3_DP_NUM_WORK_GROUPS_Y = 1,
47 IR3_DP_NUM_WORK_GROUPS_Z = 2,
48 IR3_DP_WORK_DIM = 3,
49 IR3_DP_BASE_GROUP_X = 4,
50 IR3_DP_BASE_GROUP_Y = 5,
51 IR3_DP_BASE_GROUP_Z = 6,
52 IR3_DP_CS_SUBGROUP_SIZE = 7,
53 IR3_DP_LOCAL_GROUP_SIZE_X = 8,
54 IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
55 IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
56 IR3_DP_SUBGROUP_ID_SHIFT = 11,
57 IR3_DP_WORKGROUP_ID_X = 12,
58 IR3_DP_WORKGROUP_ID_Y = 13,
59 IR3_DP_WORKGROUP_ID_Z = 14,
60 /* NOTE: gl_NumWorkGroups should be vec4 aligned because
61 * glDispatchComputeIndirect() needs to load these from
62 * the info->indirect buffer. Keep that in mind when/if
63 * adding any addition CS driver params.
64 */
65 IR3_DP_CS_COUNT = 16, /* must be aligned to vec4 */
66
67 /* vertex shader driver params: */
68 IR3_DP_DRAWID = 0,
69 IR3_DP_VTXID_BASE = 1,
70 IR3_DP_INSTID_BASE = 2,
71 IR3_DP_VTXCNT_MAX = 3,
72 IR3_DP_IS_INDEXED_DRAW = 4, /* Note: boolean, ie. 0 or ~0 */
73 /* user-clip-plane components, up to 8x vec4's: */
74 IR3_DP_UCP0_X = 5,
75 /* .... */
76 IR3_DP_UCP7_W = 36,
77 IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */
78
79 /* TCS driver params: */
80 IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0,
81 IR3_DP_HS_DEFAULT_OUTER_LEVEL_Y = 1,
82 IR3_DP_HS_DEFAULT_OUTER_LEVEL_Z = 2,
83 IR3_DP_HS_DEFAULT_OUTER_LEVEL_W = 3,
84 IR3_DP_HS_DEFAULT_INNER_LEVEL_X = 4,
85 IR3_DP_HS_DEFAULT_INNER_LEVEL_Y = 5,
86 IR3_DP_HS_COUNT = 8, /* must be aligned to vec4 */
87
88 /* fragment shader driver params: */
89 IR3_DP_FS_SUBGROUP_SIZE = 0,
90 /* Dynamic params (that aren't known when compiling the shader) */
91 IR3_DP_FS_DYNAMIC = 4,
92 IR3_DP_FS_FRAG_INVOCATION_COUNT = IR3_DP_FS_DYNAMIC,
93 IR3_DP_FS_FRAG_SIZE = IR3_DP_FS_DYNAMIC + 4,
94 IR3_DP_FS_FRAG_OFFSET = IR3_DP_FS_DYNAMIC + 6,
95 };
96
/* Compile-time limits on per-shader resources and streamout state: */
#define IR3_MAX_SHADER_BUFFERS  32
#define IR3_MAX_SHADER_IMAGES   32
#define IR3_MAX_SO_BUFFERS      4
#define IR3_MAX_SO_STREAMS      4
#define IR3_MAX_SO_OUTPUTS      128
#define IR3_MAX_UBO_PUSH_RANGES 32
103
104 /* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
enum ir3_bary {
   IJ_PERSP_PIXEL,       /* perspective interpolation at pixel center */
   IJ_PERSP_SAMPLE,      /* perspective interpolation per sample */
   IJ_PERSP_CENTROID,    /* perspective interpolation at centroid */
   IJ_PERSP_CENTER_RHW,  /* perspective, center, reciprocal-homogeneous-w */
   IJ_LINEAR_PIXEL,      /* non-perspective (linear) at pixel center */
   IJ_LINEAR_CENTROID,   /* non-perspective at centroid */
   IJ_LINEAR_SAMPLE,     /* non-perspective per sample */
   IJ_COUNT,             /* number of barycentric kinds, not a real value */
};
115
116 /* Description of what wavesizes are allowed. */
/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,      /* only the single (small) wavesize is allowed */
   IR3_SINGLE_OR_DOUBLE, /* either wavesize may be chosen */
   IR3_DOUBLE_ONLY,      /* only the double (large) wavesize is allowed */
};
122
123 /**
124 * Description of a lowered UBO.
125 */
126 struct nir_def;
127
/* Identifies one UBO (or lowered global buffer) whose loads were turned
 * into pushed consts.  Acts as the key half of an ir3_ubo_range.
 */
struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;              /* Which constant block */
   uint16_t bindless_base;      /* For bindless, which base register is used */
   bool bindless;               /* true if accessed via bindless descriptors */
   bool global;                 /* true if this is a lowered global load */
};
135
136 /**
137 * Description of a range of a lowered UBO access.
138 *
139 * Drivers should not assume that there are not multiple disjoint
140 * lowered ranges of a single UBO.
141 */
/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that there are not multiple disjoint
 * lowered ranges of a single UBO.
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo; /* which UBO this range belongs to */
   uint32_t offset;         /* start offset to push in the const register file */
   uint32_t start, end;     /* range of block that's actually used */
};
147
/* Result of analyzing which UBO ranges can be pushed to consts. */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled; /* number of valid entries in range[] */
   uint32_t size;        /* total size consumed by the pushed ranges */
};
153
/* How push constants are delivered to the shader.
 * NOTE(review): member meanings inferred from names — PER_STAGE uses each
 * stage's own const space, SHARED uses the shared const file, and
 * SHARED_PREAMBLE presumably loads shared consts from the preamble; confirm
 * against the lowering passes.
 */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,
   IR3_PUSH_CONSTS_SHARED,
   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};
160
161 /* This represents an internal UBO filled out by the driver. There are a few
162 * common UBOs that must be filled out identically by all drivers, for example
163 * for shader linkage, but drivers can also add their own that they manage
164 * themselves.
165 */
/* This represents an internal UBO filled out by the driver. There are a few
 * common UBOs that must be filled out identically by all drivers, for example
 * for shader linkage, but drivers can also add their own that they manage
 * themselves.
 */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO index assigned to this driver UBO */
   uint32_t size; /* size of the UBO contents */
};
170
171 /**
172 * Describes the layout of shader consts in the const register file.
173 *
174 * Layout of constant registers, each section aligned to vec4. Note
175 * that pointer size (ubo, etc) changes depending on generation.
176 *
177 * + user consts: only used for turnip push consts
178 * + lowered UBO ranges
179 * + preamble consts
180 * + UBO addresses: turnip is bindless and these are wasted
181 * + image dimensions: a5xx only; needed to calculate pixel offset, but only
182 * for images that have image_{load,store,size,atomic*} intrinsics
183 * + kernel params: cl only
184 * + driver params: these are stage-dependent; see ir3_driver_param
185 * + TFBO addresses: only for vs on a3xx/a4xx
186 * + primitive params: these are stage-dependent
187 * vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
188 * hs, ds: uvec4(primitive_stride, vertex_stride,
189 * patch_stride, patch_vertices_in)
190 * uvec4(tess_param_base, tess_factor_base)
191 * + primitive map
192 * + lowered immediates
193 *
194 * Immediates go last mostly because they are inserted in the CP pass
195 * after the nir -> ir3 frontend.
196 *
197 * Note UBO size in bytes should be aligned to vec4
198 */
/* See the large layout comment above: holds the per-variant description of
 * how consts are laid out in the const register file, plus the lowered
 * immediates and UBO-push analysis results.
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_driver_params; /* scalar */

   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   int32_t constant_data_dynamic_offsets;

   /* Start offset of each const section; sections are described in the
    * layout comment above the struct.
    */
   struct {
      /* user const start at zero */
      unsigned ubo;
      unsigned image_dims;
      unsigned kernel_params;
      unsigned driver_param;
      unsigned tfbo;
      unsigned primitive_param;
      unsigned primitive_map;
      unsigned immediate;
   } offsets;

   struct {
      uint32_t mask;  /* bitmask of images that have image_store */
      uint32_t count; /* number of consts allocated */
      /* three const allocated per image which has image_store:
       *  + cpp         (bytes per pixel)
       *  + pitch       (y pitch)
       *  + array_pitch (z pitch)
       */
      uint32_t off[IR3_MAX_SHADER_IMAGES];
   } image_dims;

   unsigned immediates_count; /* number of immediates in use */
   unsigned immediates_size;  /* allocated size of immediates[] */
   uint32_t *immediates;

   unsigned preamble_size;
   unsigned global_size;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
243
244 /**
245 * A single output for vertex transform feedback.
246 */
/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2; /**< 0 to 3 */
   unsigned num_components : 3;  /**< 1 to 4 */
   unsigned output_buffer : 3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset : 16;     /**< offset into the buffer in dwords */
   unsigned stream : 2;          /**< 0 to 3 */
};
255
256 /**
257 * Stream output for vertex transform feedback.
258 */
/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs; /* number of valid entries in output[] */
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written;
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
274
275 /**
276 * Starting from a4xx, HW supports pre-dispatching texture sampling
277 * instructions prior to scheduling a shader stage, when the
278 * coordinate maps exactly to an output of the previous stage.
279 */
280
281 /**
282 * There is a limit in the number of pre-dispatches allowed for any
283 * given stage.
284 */
285 #define IR3_MAX_SAMPLER_PREFETCH 4
286
287 /**
288 * This is the output stream value for 'cmd', as used by blob. It may
289 * encode the return type (in 3 bits) but it hasn't been verified yet.
290 */
291 #define IR3_SAMPLER_PREFETCH_CMD 0x4
292 #define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
293
294 /**
295 * Stream output for texture sampling pre-dispatches.
296 */
/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src;              /* source for the prefetched coordinate */
   bool bindless;            /* use the bindless_*_id fields if set */
   uint8_t samp_id;          /* bound sampler index (non-bindless) */
   uint8_t tex_id;           /* bound texture index (non-bindless) */
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;              /* destination register */
   uint8_t wrmask;           /* destination component writemask */
   uint8_t half_precision;
   opc_t tex_opc;            /* texture opcode to pre-dispatch */
};
309
310 /* Configuration key used to identify a shader variant.. different
311 * shader variants can be used to implement features not supported
312 * in hw (two sided color), binning-pass vertex shader, etc.
313 *
314 * When adding to this struct, please update ir3_shader_variant()'s debug
315 * output.
316 */
/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 *
 * NOTE: ir3_shader_key_equal() compares the whole struct with memcmp on
 * the slow path, so the struct must not contain uninitialized padding.
 */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader. In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;
      };
      /* All of the bitfields above, viewed as one word for fast compare: */
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
375
376 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)377 ir3_tess_mode(enum tess_primitive_mode tess_mode)
378 {
379 switch (tess_mode) {
380 case TESS_PRIMITIVE_ISOLINES:
381 return IR3_TESS_ISOLINES;
382 case TESS_PRIMITIVE_TRIANGLES:
383 return IR3_TESS_TRIANGLES;
384 case TESS_PRIMITIVE_QUADS:
385 return IR3_TESS_QUADS;
386 default:
387 unreachable("bad tessmode");
388 }
389 }
390
391 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)392 ir3_tess_factor_stride(unsigned patch_type)
393 {
394 /* note: this matches the stride used by ir3's build_tessfactor_base */
395 switch (patch_type) {
396 case IR3_TESS_ISOLINES:
397 return 12;
398 case IR3_TESS_TRIANGLES:
399 return 20;
400 case IR3_TESS_QUADS:
401 return 28;
402 default:
403 unreachable("bad tessmode");
404 }
405 }
406
407 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)408 ir3_shader_key_equal(const struct ir3_shader_key *a,
409 const struct ir3_shader_key *b)
410 {
411 /* slow-path if we need to check {v,f}saturate_{s,t,r} */
412 if (a->has_per_samp || b->has_per_samp)
413 return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
414 return a->global == b->global;
415 }
416
417 /* will the two keys produce different lowering for a fragment shader? */
418 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)419 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
420 struct ir3_shader_key *last_key)
421 {
422 if (last_key->has_per_samp || key->has_per_samp) {
423 if ((last_key->fsamples != key->fsamples) ||
424 (last_key->fastc_srgb != key->fastc_srgb) ||
425 memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
426 sizeof(key->fsampler_swizzles)))
427 return true;
428 }
429
430 if (last_key->rasterflat != key->rasterflat)
431 return true;
432
433 if (last_key->ucp_enables != key->ucp_enables)
434 return true;
435
436 if (last_key->safe_constlen != key->safe_constlen)
437 return true;
438
439 return false;
440 }
441
442 /* will the two keys produce different lowering for a vertex shader? */
443 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)444 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
445 struct ir3_shader_key *last_key)
446 {
447 if (last_key->has_per_samp || key->has_per_samp) {
448 if ((last_key->vsamples != key->vsamples) ||
449 (last_key->vastc_srgb != key->vastc_srgb) ||
450 memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
451 sizeof(key->vsampler_swizzles)))
452 return true;
453 }
454
455 if (last_key->ucp_enables != key->ucp_enables)
456 return true;
457
458 if (last_key->safe_constlen != key->safe_constlen)
459 return true;
460
461 return false;
462 }
463
464 /**
465 * On a4xx+a5xx, Images share state with textures and SSBOs:
466 *
467 * + Uses texture (cat5) state/instruction (isam) to read
468 * + Uses SSBO state and instructions (cat6) to write and for atomics
469 *
470 * Starting with a6xx, Images and SSBOs are basically the same thing,
471 * with texture state and isam also used for SSBO reads.
472 *
473 * On top of that, gallium makes the SSBO (shader_buffers) state semi
474 * sparse, with the first half of the state space used for atomic
475 * counters lowered to atomic buffers. We could ignore this, but I
476 * don't think we could *really* handle the case of a single shader
477 * that used the max # of textures + images + SSBOs. And once we are
478 * offsetting images by num_ssbos (or visa versa) to map them into
479 * the same hardware state, the hardware state has become coupled to
480 * the shader state, so at this point we might as well just use a
481 * mapping table to remap things from image/SSBO idx to hw idx.
482 *
483 * To make things less (more?) confusing, for the hw "SSBO" state
484 * (since it is really both SSBO and Image) I'll use the name "IBO"
485 */
/* Remap tables between logical SSBO/Image indices and hw "IBO"/tex slots;
 * see the long comment above for why this mapping exists.
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
507
/* Captured disassembly/debug text for a variant (only filled when
 * write_disasm is set).
 */
struct ir3_disasm_info {
   bool write_disasm; /* whether to capture nir/disasm strings */
   char *nir;         /* NIR dump */
   char *disasm;      /* assembled shader disassembly */
};
513
/* Represents half register in regid */
#define HALF_REG_ID 0x100
516
/* Compile-time options that apply to all variants of a shader. */
struct ir3_shader_options {
   /* number of const vec4's reserved at the start for the driver/user: */
   unsigned num_reserved_user_consts;
   /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use. If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   uint32_t push_consts_base;
   uint32_t push_consts_dwords;
};
533
534 /**
535 * Shader variant which contains the actual hw shader instructions,
536 * and necessary info for shader state setup.
537 */
/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 *
 * NOTE: everything from VARIANT_CACHE_START onward is serialized raw into
 * the disk cache, so field order/layout below that point matters.
 */
struct ir3_shader_variant {
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   //	union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //	};

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass). Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant. Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   /* number of loops: */
   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used. So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;  /* see NOTE above for slot namespace */
      uint8_t regid; /* output register */
      uint8_t view;
      bool half : 1; /* half-precision output */
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage. In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[12 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc). This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary : 1; /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half : 1;
      bool flat : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;

   /* sum of input components (scalar). For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3. Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader. Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* per shader-stage info, selected by 'type': */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw:1;
         bool point_mode:1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in:3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output : 1;
         bool fbfetch_coherent : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
      } cs;
   };

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
838
839 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)840 ir3_shader_stage(struct ir3_shader_variant *v)
841 {
842 switch (v->type) {
843 case MESA_SHADER_VERTEX:
844 return v->binning_pass ? "BVERT" : "VERT";
845 case MESA_SHADER_TESS_CTRL:
846 return "TCS";
847 case MESA_SHADER_TESS_EVAL:
848 return "TES";
849 case MESA_SHADER_GEOMETRY:
850 return "GEOM";
851 case MESA_SHADER_FRAGMENT:
852 return "FRAG";
853 case MESA_SHADER_COMPUTE:
854 case MESA_SHADER_KERNEL:
855 return "CL";
856 default:
857 unreachable("invalid type");
858 return NULL;
859 }
860 }
861
862 /* Currently we do not do binning for tess. And for GS there is no
863 * cross-stage VS+GS optimization, so the full VS+GS is used in
864 * the binning pass.
865 */
866 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)867 ir3_has_binning_vs(const struct ir3_shader_key *key)
868 {
869 if (key->tessellation || key->has_gs)
870 return false;
871 return true;
872 }
873
874 /**
875 * Represents a shader at the API level, before state-specific variants are
876 * generated.
877 */
/**
 * Represents a shader at the API level, before state-specific variants are
 * generated.
 */
struct ir3_shader {
   gl_shader_stage type;

   /* shader id (for debug): */
   uint32_t id;
   uint32_t variant_count;

   /* Set by freedreno after shader_state_create, so we can emit debug info
    * when recompiling a shader at draw time.
    */
   bool initial_variants_done;

   struct ir3_compiler *compiler;

   struct ir3_shader_options options;

   bool nir_finalized;
   struct nir_shader *nir;
   struct ir3_stream_output_info stream_output;

   /* per shader stage specific info: */
   union {
      /* for compute shaders: */
      struct {
         unsigned req_input_mem; /* in dwords */
         unsigned req_local_mem;
      } cs;
      /* For vertex shaders: */
      struct {
         /* If we need to generate a passthrough TCS, it will be a function of
          * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
          * in the VS keyed by # of patch_vertices-1.
          */
         unsigned passthrough_tcs_compiled;
         struct ir3_shader *passthrough_tcs[32];
      } vs;
   };

   /* linked list of variants, guarded by variants_lock: */
   struct ir3_shader_variant *variants;
   mtx_t variants_lock;

   cache_key cache_key; /* shader disk-cache key */

   /* Bitmask of bits of the shader key used by this shader. Used to avoid
    * recompiles for GL NOS that doesn't actually apply to the shader.
    */
   struct ir3_shader_key key_mask;
};
926
927 /**
928 * In order to use the same cmdstream, in particular constlen setup and const
929 * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
930 * corresponding draw pass shaders const_state.
931 */
932 static inline struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)933 ir3_const_state(const struct ir3_shader_variant *v)
934 {
935 if (v->binning_pass)
936 return v->nonbinning->const_state;
937 return v->const_state;
938 }
939
940 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)941 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
942 {
943 const struct ir3_compiler *compiler = v->compiler;
944 bool shared_consts_enable =
945 ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
946
947 /* Shared consts size for CS and FS matches with what's acutally used,
948 * but the size of shared consts for geomtry stages doesn't.
949 * So we use a hw quirk for geometry shared consts.
950 */
951 uint32_t shared_consts_size = shared_consts_enable ?
952 compiler->shared_consts_size : 0;
953
954 uint32_t shared_consts_size_geom = shared_consts_enable ?
955 compiler->geom_shared_consts_size_quirk : 0;
956
957 uint32_t safe_shared_consts_size = shared_consts_enable ?
958 ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
959 DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
960
961 if ((v->type == MESA_SHADER_COMPUTE) ||
962 (v->type == MESA_SHADER_KERNEL)) {
963 return compiler->max_const_compute - shared_consts_size;
964 } else if (safe_constlen) {
965 return compiler->max_const_safe - safe_shared_consts_size;
966 } else if (v->type == MESA_SHADER_FRAGMENT) {
967 return compiler->max_const_frag - shared_consts_size;
968 } else {
969 return compiler->max_const_geom - shared_consts_size_geom;
970 }
971 }
972
973 /* Given a variant, calculate the maximum constlen it can have.
974 */
975 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)976 ir3_max_const(const struct ir3_shader_variant *v)
977 {
978 return _ir3_max_const(v, v->key.safe_constlen);
979 }
980
981 /* Return true if a variant may need to be recompiled due to exceeding the
982 * maximum "safe" constlen.
983 */
984 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)985 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
986 {
987 return v->constlen > _ir3_max_const(v, true);
988 }
989
/* Assemble the variant's ir3 into machine code; returns the binary. */
void *ir3_shader_assemble(struct ir3_shader_variant *v);
/* Compile (and link into the shader's variant list) a variant for 'key'. */
struct ir3_shader_variant *
ir3_shader_create_variant(struct ir3_shader *shader,
                          const struct ir3_shader_key *key,
                          bool keep_ir);
/* Look up (or create) a variant; '*created' reports whether it was new. */
struct ir3_shader_variant *
ir3_shader_get_variant(struct ir3_shader *shader,
                       const struct ir3_shader_key *key, bool binning_pass,
                       bool keep_ir, bool *created);

/* Wrap a nir_shader into an ir3_shader (takes ownership of the NIR). */
struct ir3_shader *
ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
                    const struct ir3_shader_options *options,
                    struct ir3_stream_output_info *stream_output);
uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
                           const struct ir3_compiler *compiler);
/* Get/compile the cached passthrough TCS for a VS (see ir3_shader::vs). */
struct ir3_shader *
ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
void ir3_shader_destroy(struct ir3_shader *shader);
void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
uint64_t ir3_shader_outputs(const struct ir3_shader *so);

int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
1013
1014 /*
1015 * Helper/util:
1016 */
1017
1018 /* clears shader-key flags which don't apply to the given shader.
1019 */
1020 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1021 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1022 {
1023 uint32_t *key_bits = (uint32_t *)key;
1024 uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1025 STATIC_ASSERT(sizeof(*key) % 4 == 0);
1026 for (int i = 0; i < sizeof(*key) >> 2; i++)
1027 key_bits[i] &= key_mask[i];
1028 }
1029
1030 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1031 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1032 {
1033 int j;
1034
1035 for (j = 0; j < so->outputs_count; j++)
1036 if (so->outputs[j].slot == slot)
1037 return j;
1038
1039 /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1040 * in the vertex shader.. but the fragment shader doesn't know this
1041 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So
1042 * at link time if there is no matching OUT.BCOLOR[n], we must map
1043 * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only
1044 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1045 */
1046 if (slot == VARYING_SLOT_BFC0) {
1047 slot = VARYING_SLOT_COL0;
1048 } else if (slot == VARYING_SLOT_BFC1) {
1049 slot = VARYING_SLOT_COL1;
1050 } else if (slot == VARYING_SLOT_COL0) {
1051 slot = VARYING_SLOT_BFC0;
1052 } else if (slot == VARYING_SLOT_COL1) {
1053 slot = VARYING_SLOT_BFC1;
1054 } else {
1055 return -1;
1056 }
1057
1058 for (j = 0; j < so->outputs_count; j++)
1059 if (so->outputs[j].slot == slot)
1060 return j;
1061
1062 return -1;
1063 }
1064
1065 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1066 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1067 {
1068 while (++i < so->inputs_count)
1069 if (so->inputs[i].compmask && so->inputs[i].bary)
1070 break;
1071 return i;
1072 }
1073
1074 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1075 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1076 {
1077 int j = -1;
1078
1079 while (true) {
1080 j = ir3_next_varying(so, j);
1081
1082 if (j >= so->inputs_count)
1083 return -1;
1084
1085 if (so->inputs[j].slot == slot)
1086 return j;
1087 }
1088 }
1089
1090 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1091 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1092 {
1093 int var = ir3_find_input(so, slot);
1094 return var == -1 ? 0xff : so->inputs[var].inloc;
1095 }
1096
/* Describes the linkage between producer (VS/DS/GS) outputs and FS inputs,
 * built up via ir3_link_add()/ir3_link_shaders().
 */
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;     /* varying slot (gl_varying_slot) */
      uint8_t regid;    /* producer-side register id */
      uint8_t compmask; /* mask of components used */
      uint8_t loc;      /* VPC location */
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough (0xff if unused)
    */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough (0xff if unused)
    */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff if unused)
    */
   uint8_t clip0_loc, clip1_loc;
};
1128
1129 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1130 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1131 uint8_t compmask, uint8_t loc)
1132 {
1133 for (int j = 0; j < util_last_bit(compmask); j++) {
1134 uint8_t comploc = loc + j;
1135 l->varmask[comploc / 32] |= 1 << (comploc % 32);
1136 }
1137
1138 l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1139
1140 if (regid_ != regid(63, 0)) {
1141 int i = l->cnt++;
1142 assert(i < ARRAY_SIZE(l->var));
1143
1144 l->var[i].slot = slot;
1145 l->var[i].regid = regid_;
1146 l->var[i].compmask = compmask;
1147 l->var[i].loc = loc;
1148 }
1149 }
1150
1151 static inline void
ir3_link_shaders(struct ir3_shader_linkage * l,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * fs,bool pack_vs_out)1152 ir3_link_shaders(struct ir3_shader_linkage *l,
1153 const struct ir3_shader_variant *vs,
1154 const struct ir3_shader_variant *fs, bool pack_vs_out)
1155 {
1156 /* On older platforms, varmask isn't programmed at all, and it appears
1157 * that the hardware generates a mask of used VPC locations using the VS
1158 * output map, and hangs if a FS bary instruction references a location
1159 * not in the list. This means that we need to have a dummy entry in the
1160 * VS out map for things like gl_PointCoord which aren't written by the
1161 * VS. Furthermore we can't use r63.x, so just pick a random register to
1162 * use if there is no VS output.
1163 */
1164 const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
1165 int j = -1, k;
1166
1167 l->primid_loc = 0xff;
1168 l->viewid_loc = 0xff;
1169 l->clip0_loc = 0xff;
1170 l->clip1_loc = 0xff;
1171
1172 while (l->cnt < ARRAY_SIZE(l->var)) {
1173 j = ir3_next_varying(fs, j);
1174
1175 if (j >= fs->inputs_count)
1176 break;
1177
1178 if (fs->inputs[j].inloc >= fs->total_in)
1179 continue;
1180
1181 k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);
1182
1183 if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
1184 l->primid_loc = fs->inputs[j].inloc;
1185 }
1186
1187 if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
1188 assert(k < 0);
1189 l->viewid_loc = fs->inputs[j].inloc;
1190 }
1191
1192 if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
1193 l->clip0_loc = fs->inputs[j].inloc;
1194
1195 if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
1196 l->clip1_loc = fs->inputs[j].inloc;
1197
1198 ir3_link_add(l, fs->inputs[j].slot,
1199 k >= 0 ? vs->outputs[k].regid : default_regid,
1200 fs->inputs[j].compmask, fs->inputs[j].inloc);
1201 }
1202 }
1203
1204 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1205 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1206 {
1207 int j;
1208 for (j = 0; j < so->outputs_count; j++)
1209 if (so->outputs[j].slot == slot) {
1210 uint32_t regid = so->outputs[j].regid;
1211 if (so->outputs[j].half)
1212 regid |= HALF_REG_ID;
1213 return regid;
1214 }
1215 return regid(63, 0);
1216 }
1217
/* Dump raw bitset words to the given stream (debug helper). */
void print_raw(FILE *out, const BITSET_WORD *data, size_t size);

/* Add the variant's stream-output varyings to the linkage map. */
void ir3_link_stream_out(struct ir3_shader_linkage *l,
                         const struct ir3_shader_variant *v);

/* ir3-internal varying slots, allocated after the generic gl ones: */
#define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
#define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
#define VARYING_SLOT_REL_PATCH_ID_IR3    (VARYING_SLOT_MAX + 3)
1227
1228 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1229 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1230 {
1231 if (!so)
1232 return regid(63, 0);
1233 for (int j = 0; j < so->inputs_count; j++)
1234 if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1235 return so->inputs[j].regid;
1236 return regid(63, 0);
1237 }
1238
1239 /* calculate register footprint in terms of half-regs (ie. one full
1240 * reg counts as two half-regs).
1241 */
1242 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1243 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1244 {
1245 return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1246 }
1247
/* Returns the variant's IBO count. */
static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}
1253
1254 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1255 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1256 {
1257 /* Dummy shader */
1258 if (!v->compiler)
1259 return 0;
1260
1261 if (v->compiler->gen < 5)
1262 return v->branchstack;
1263
1264 return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1265 }
1266
1267 ENDC;
1268
1269 #endif /* IR3_SHADER_H_ */
1270