1 /*
2 * Copyright © 2010 - 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #pragma once
25
26 #include <stdio.h>
27 #include "c11/threads.h"
28 #include "dev/intel_device_info.h"
29 #include "isl/isl.h"
30 #include "util/macros.h"
31 #include "util/mesa-sha1.h"
32 #include "util/enum_operators.h"
33 #include "util/ralloc.h"
34 #include "util/u_math.h"
35 #include "elk_isa_info.h"
36 #include "../intel_shader_enums.h"
37
38 #ifdef __cplusplus
39 extern "C" {
40 #endif
41
/* Forward declarations: the compiler interface only holds pointers to these,
 * so the full definitions are not required in this header.
 */
struct ra_regs;
struct nir_shader;
struct shader_info;

struct nir_shader_compiler_options;
typedef struct nir_shader nir_shader;

/* Number of register-allocator classes used by the scalar (FS) back-end;
 * sizes elk_compiler::fs_reg_sets[].classes below.
 */
#define REG_CLASS_COUNT 20
50
/**
 * Device-wide compiler context, shared by all shader compilations for a
 * given GPU.  Created once per device; treated as read-only after setup
 * except under \c mutex.
 */
struct elk_compiler {
   /** Description of the GPU this compiler targets. */
   const struct intel_device_info *devinfo;

   /* This lock must be taken if the compiler is to be modified in any way,
    * including adding something to the ralloc child list.
    */
   mtx_t mutex;

   /** Instruction-set description used for encoding/validation. */
   struct elk_isa_info isa;

   /** Register-allocator state for the vec4 back-end. */
   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used.
       */
      struct ra_class **classes;
   } vec4_reg_set;

   /** Register-allocator state for the scalar (FS) back-end.
    *
    * NOTE(review): the three entries appear to correspond to the supported
    * dispatch widths (SIMD8/16/32) — confirm against the users of this
    * array.
    */
   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      struct ra_class *classes[REG_CLASS_COUNT];

      /**
       * ra class for the aligned barycentrics we use for PLN, which doesn't
       * appear in *classes.
       */
      struct ra_class *aligned_bary_class;
   } fs_reg_sets[3];

   /* Driver-supplied callbacks for reporting debug and performance messages.
    * The unsigned* argument is the per-call-site message id allocated by the
    * elk_shader_debug_log()/elk_shader_perf_log() macros below.
    */
   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

   /** True for stages compiled with the scalar back-end instead of vec4. */
   bool scalar_stage[MESA_ALL_SHADER_STAGES];
   /** Per-stage NIR compiler options handed to the frontends. */
   struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State
    * Base Address?  (If not, it's a normal GPU address.)
    */
   bool constant_buffer_0_is_relative;

   /**
    * Whether or not the driver supports NIR shader constants.  This controls
    * whether nir_opt_large_constants will be run.
    */
   bool supports_shader_constants;

   /**
    * Whether indirect UBO loads should use the sampler or go through the
    * data/constant cache.  For the sampler, UBO surface states have to be set
    * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the
    * constant or data cache, UBOs must use VK_FORMAT_RAW.
    */
   bool indirect_ubos_use_sampler;

   /**
    * Calling the ra_allocate function after each register spill can take
    * several minutes.  This option speeds up shader compilation by spilling
    * more registers after the ra_allocate failure.  Required for
    * Cyberpunk 2077, which uses a watchdog thread to terminate the process
    * in case the render thread hasn't responded within 2 minutes.
    */
   int spilling_rate;
};
128
/* Emit a message through the driver's debug/perf callback.  The
 * function-local static gives each call site a stable slot the driver can
 * use to assign and reuse a message id (e.g. for de-duplication).
 */
#define elk_shader_debug_log(compiler, data, fmt, ... ) do { \
   static unsigned id = 0; \
   compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__); \
} while (0)

#define elk_shader_perf_log(compiler, data, fmt, ... ) do { \
   static unsigned id = 0; \
   compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__); \
} while (0)
138
139 /**
140 * We use a constant subgroup size of 32. It really only needs to be a
141 * maximum and, since we do SIMD32 for compute shaders in some cases, it
142 * needs to be at least 32. SIMD8 and SIMD16 shaders will still claim a
143 * subgroup size of 32 but will act as if 16 or 24 of those channels are
144 * disabled.
145 */
146 #define ELK_SUBGROUP_SIZE 32
147
148 /**
149 * Program key structures.
150 *
151 * When drawing, we look for the currently bound shaders in the program
152 * cache. This is essentially a hash table lookup, and these are the keys.
153 *
154 * Sometimes OpenGL features specified as state need to be simulated via
155 * shader code, due to a mismatch between the API and the hardware. This
 * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
157 * in the program key so it's considered when searching for a program. If
158 * we haven't seen a particular combination before, we have to recompile a
159 * new specialized version.
160 *
161 * Shader compilation should not look up state in gl_context directly, but
162 * instead use the copy in the program key. This guarantees recompiles will
163 * happen correctly.
164 *
165 * @{
166 */
167
/* Per-sampler workaround flags for gather4 on Sandybridge.  These are
 * bitflags, so multiple fixups can be combined for one sampler.
 */
enum PACKED elk_gfx6_gather_sampler_wa {
   ELK_WA_SIGN = 1,   /* whether we need to sign extend */
   ELK_WA_8BIT = 2,   /* if we have an 8bit format needing wa */
   ELK_WA_16BIT = 4,  /* if we have a 16bit format needing wa */
};

/* Maximum number of samplers a single stage can reference. */
#define ELK_MAX_SAMPLERS 32
175
176 /* Provide explicit padding for each member, to ensure that the compiler
177 * initializes every bit in the shader cache keys. The keys will be compared
178 * with memcmp.
179 */
180 PRAGMA_DIAGNOSTIC_PUSH
181 PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)
182
183 /**
184 * Sampler information needed by VS, WM, and GS program cache keys.
185 */
/**
 * Sampler information needed by VS, WM, and GS program cache keys.
 */
struct elk_sampler_prog_key_data {
   /**
    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
    *
    * This field is not consumed by the back-end compiler and is only relevant
    * for the crocus OpenGL driver for Broadwell and earlier hardware.
    */
   uint16_t swizzles[ELK_MAX_SAMPLERS];

   /* Per-coordinate (s/t/r) bitmasks of samplers using GL_CLAMP, which
    * requires a shader workaround on older hardware.
    * NOTE(review): index meaning inferred from the array size of 3 — confirm
    * against the consumers of this field.
    */
   uint32_t gl_clamp_mask[3];

   /**
    * For RG32F, gather4's channel select is broken.
    */
   uint32_t gather_channel_quirk_mask;

   /**
    * For Sandybridge, which shader w/a we need for gather quirks.
    */
   enum elk_gfx6_gather_sampler_wa gfx6_gather_wa[ELK_MAX_SAMPLERS];
};
207
/* Which buffer types need robust (bounds-checked) access lowering. */
enum elk_robustness_flags {
   ELK_ROBUSTNESS_UBO  = BITFIELD_BIT(0),
   ELK_ROBUSTNESS_SSBO = BITFIELD_BIT(1),
};
212
/* Fields common to the program keys of every shader stage.  Embedded as the
 * first member of each stage-specific key so the keys can be handled
 * generically (see union elk_any_prog_key).
 */
struct elk_base_prog_key {
   /** Driver-assigned id distinguishing otherwise identical source shaders. */
   unsigned program_string_id;

   enum elk_robustness_flags robust_flags:2;

   unsigned padding:22;

   /**
    * Apply workarounds for SIN and COS input range problems.
    * This limits input range for SIN and COS to [-2π, 2π] to
    * avoid precision issues.
    */
   bool limit_trig_input_range;

   struct elk_sampler_prog_key_data tex;
};
229
/**
 * The VF can't natively handle certain types of attributes, such as GL_FIXED
 * or most 10_10_10_2 types.  These flags enable various VS workarounds to
 * "fix" attributes at the beginning of shaders.
 *
 * One combination of these flags is stored per attribute slot in
 * elk_vs_prog_key::gl_attrib_wa_flags.
 */
#define ELK_ATTRIB_WA_COMPONENT_MASK 7  /* mask for GL_FIXED scale channel count */
#define ELK_ATTRIB_WA_NORMALIZE 8       /* normalize in shader */
#define ELK_ATTRIB_WA_BGRA 16           /* swap r/b channels in shader */
#define ELK_ATTRIB_WA_SIGN 32           /* interpret as signed in shader */
#define ELK_ATTRIB_WA_SCALE 64          /* interpret as scaled in shader */

/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes.  In Vulkan, we expose up to 28 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 28)
249
250 /**
251 * Max number of binding table entries used for stream output.
252 *
253 * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
254 * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
255 *
256 * On Gfx6, the size of transform feedback data is limited not by the number
257 * of components but by the number of binding table entries we set aside. We
258 * use one binding table entry for a float, one entry for a vector, and one
259 * entry per matrix column. Since the only way we can communicate our
260 * transform feedback capabilities to the client is via
261 * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
262 * worst case, in which all the varyings are floats, so we use up one binding
263 * table entry per component. Therefore we need to set aside at least 64
264 * binding table entries for use by transform feedback.
265 *
266 * Note: since we don't currently pack varyings, it is currently impossible
267 * for the client to actually use up all of these binding table entries--if
268 * all of their varyings were floats, they would run out of varying slots and
269 * fail to link. But that's a bug, so it seems prudent to go ahead and
270 * allocate the number of binding table entries we will need once the bug is
271 * fixed.
272 */
273 #define ELK_MAX_SOL_BINDINGS 64
274
275 /** The program key for Vertex Shaders. */
/** The program key for Vertex Shaders. */
struct elk_vs_prog_key {
   struct elk_base_prog_key base;

   /**
    * Per-attribute workaround flags
    *
    * For each attribute, a combination of ELK_ATTRIB_WA_*.
    *
    * For OpenGL, where we expose a maximum of 16 user input attributes
    * we only need up to VERT_ATTRIB_MAX slots, however, in Vulkan
    * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can
    * expose up to 28 user input vertex attributes that are mapped to slots
    * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large
    * enough to hold this many slots.
    */
   uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)];

   /**
    * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates
    * are going to be replaced with point coordinates (as a consequence of a
    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
    * our SF thread requires exact matching between VS outputs and FS inputs,
    * these texture coordinates will need to be unconditionally included in
    * the VUE, even if they aren't written by the vertex shader.
    */
   uint8_t point_coord_replace;
   /** Clamp gl_PointSize to the implementation-supported range. */
   unsigned clamp_pointsize:1;

   /** Copy the edge flag input to an output (legacy GL edge flags). */
   bool copy_edgeflag:1;

   /** Clamp vertex colors to [0, 1] (GL_ARB_color_buffer_float state). */
   bool clamp_vertex_color:1;

   /**
    * How many user clipping planes are being uploaded to the vertex shader as
    * push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;

   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   uint32_t padding: 25;
};
319
320 /** The program key for Tessellation Control Shaders. */
/** The program key for Tessellation Control Shaders. */
struct elk_tcs_prog_key
{
   struct elk_base_prog_key base;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   /** Primitive mode of the paired evaluation shader (triangles/quads/isolines). */
   enum tess_primitive_mode _tes_primitive_mode;

   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   /** Apply the workaround for quad domains (see consumers for details). */
   bool quads_workaround;
   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   uint32_t padding:24;
};

/** Hardware maximum number of TCS input (patch) vertices. */
#define ELK_MAX_TCS_INPUT_VERTICES (32)
341
342 static inline uint32_t
elk_tcs_prog_key_input_vertices(const struct elk_tcs_prog_key * key)343 elk_tcs_prog_key_input_vertices(const struct elk_tcs_prog_key *key)
344 {
345 return key->input_vertices != 0 ?
346 key->input_vertices : ELK_MAX_TCS_INPUT_VERTICES;
347 }
348
349 /** The program key for Tessellation Evaluation Shaders. */
/** The program key for Tessellation Evaluation Shaders. */
struct elk_tes_prog_key
{
   struct elk_base_prog_key base;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   /**
    * How many user clipping planes are being uploaded to the tessellation
    * evaluation shader as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
   /** Clamp gl_PointSize to the implementation-supported range. */
   unsigned clamp_pointsize:1;
   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   uint32_t padding:27;
};
371
372 /** The program key for Geometry Shaders. */
/** The program key for Geometry Shaders. */
struct elk_gs_prog_key
{
   struct elk_base_prog_key base;

   /**
    * How many user clipping planes are being uploaded to the geometry shader
    * as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
   /** Clamp gl_PointSize to the implementation-supported range. */
   unsigned clamp_pointsize:1;
   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   unsigned padding:27;
};
388
/* Primitive type as seen by the fixed-function SF (strips/fans) unit. */
enum elk_sf_primitive {
   ELK_SF_PRIM_POINTS = 0,
   ELK_SF_PRIM_LINES = 1,
   ELK_SF_PRIM_TRIANGLES = 2,
   ELK_SF_PRIM_UNFILLED_TRIS = 3,
};

/** The program key for the Gfx4-5 fixed-function SF shader. */
struct elk_sf_prog_key {
   /** Bitfield of VUE attributes present. */
   uint64_t attrs;
   bool contains_flat_varying;
   /** Per-slot interpolation mode. */
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */
   /** Bitmask of texture coordinates replaced by point sprite coords. */
   uint8_t point_sprite_coord_replace;
   enum elk_sf_primitive primitive:2;
   bool do_twoside_color:1;
   bool frontface_ccw:1;
   bool do_point_sprite:1;
   bool do_point_coord:1;
   bool sprite_origin_lower_left:1;
   bool userclip_active:1;
   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   unsigned padding: 32;
};
410
/* Overall clipping strategy selected for the Gfx4-5 clip shader. */
enum elk_clip_mode {
   ELK_CLIP_MODE_NORMAL             = 0,
   ELK_CLIP_MODE_CLIP_ALL           = 1,
   ELK_CLIP_MODE_CLIP_NON_REJECTED  = 2,
   ELK_CLIP_MODE_REJECT_ALL         = 3,
   ELK_CLIP_MODE_ACCEPT_ALL         = 4,
   ELK_CLIP_MODE_KERNEL_CLIP        = 5,
};

/* How (un)filled polygons are emitted, per winding direction. */
enum elk_clip_fill_mode {
   ELK_CLIP_FILL_MODE_LINE = 0,
   ELK_CLIP_FILL_MODE_POINT = 1,
   ELK_CLIP_FILL_MODE_FILL = 2,
   ELK_CLIP_FILL_MODE_CULL = 3,
};

/* Note that if unfilled primitives are being emitted, we have to fix
 * up polygon offset and flatshading at this point:
 */
struct elk_clip_prog_key {
   /** Bitfield of VUE attributes present. */
   uint64_t attrs;
   /* Polygon offset state, needed when emitting unfilled primitives. */
   float offset_factor;
   float offset_units;
   float offset_clamp;
   bool contains_flat_varying;
   bool contains_noperspective_varying;
   /** Per-slot interpolation mode. */
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */
   unsigned primitive:4;
   unsigned nr_userclip:4;
   /** Provoking vertex is the first vertex (flat-shading convention). */
   bool pv_first:1;
   bool do_unfilled:1;
   enum elk_clip_fill_mode fill_cw:2;  /* includes cull information */
   enum elk_clip_fill_mode fill_ccw:2; /* includes cull information */
   bool offset_cw:1;
   bool offset_ccw:1;
   bool copy_bfc_cw:1;
   bool copy_bfc_ccw:1;
   enum elk_clip_mode clip_mode:3;
   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   uint64_t padding:51;
};
451
452 /* A big lookup table is used to figure out which and how many
453 * additional regs will inserted before the main payload in the WM
454 * program execution. These mainly relate to depth and stencil
455 * processing and the early-depth-test optimization.
456 */
/* A big lookup table is used to figure out which and how many
 * additional regs will inserted before the main payload in the WM
 * program execution.  These mainly relate to depth and stencil
 * processing and the early-depth-test optimization.
 */
enum elk_wm_iz_bits {
   ELK_WM_IZ_PS_KILL_ALPHATEST_BIT     = 0x1,
   ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT     = 0x2,
   ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT    = 0x4,
   ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT     = 0x8,
   ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT  = 0x10,
   ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT   = 0x20,
   ELK_WM_IZ_BIT_MAX                   = 0x40
};

/* Tri-state for properties that may be known at compile time or deferred to
 * a run-time (pushed) decision.  Ordered so that elk_sometimes_invert() can
 * flip NEVER and ALWAYS arithmetically.
 */
enum elk_sometimes {
   ELK_NEVER = 0,
   ELK_SOMETIMES,
   ELK_ALWAYS
};
472
473 static inline enum elk_sometimes
elk_sometimes_invert(enum elk_sometimes x)474 elk_sometimes_invert(enum elk_sometimes x)
475 {
476 return (enum elk_sometimes)((int)ELK_ALWAYS - (int)x);
477 }
478
479 /** The program key for Fragment/Pixel Shaders. */
/** The program key for Fragment/Pixel Shaders. */
struct elk_wm_prog_key {
   struct elk_base_prog_key base;

   /* Bitfield of VUE slots that are valid FS inputs.
    * NOTE(review): consumed for pre-Gfx6 SF/FS input matching — confirm.
    */
   uint64_t input_slots_valid;
   /** Reference value for the Gfx4/5 MRT alpha test (see alpha_test_func). */
   float alpha_test_ref;
   /** Bitmask of color outputs with a bound render target. */
   uint8_t color_outputs_valid;

   /* Some collection of ELK_WM_IZ_* */
   uint8_t iz_lookup;
   bool stats_wm:1;
   /** Force flat shading of color varyings (legacy GL flat-shade state). */
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool emit_alpha_test:1;
   enum compare_func alpha_test_func:3; /* < For Gfx4/5 MRT alpha test */
   bool alpha_test_replicate_alpha:1;
   enum elk_sometimes alpha_to_coverage:2;
   bool clamp_fragment_color:1;

   bool force_dual_color_blend:1;

   /** Whether or inputs are interpolated at sample rate by default
    *
    * This corresponds to the sample shading API bit in Vulkan or OpenGL which
    * controls how inputs with no interpolation qualifier are interpolated.
    * This is distinct from the way that using gl_SampleID or similar requires
    * us to run per-sample.  Even when running per-sample due to gl_SampleID,
    * we may still interpolate unqualified inputs at the pixel center.
    */
   enum elk_sometimes persample_interp:2;

   /* Whether or not we are running on a multisampled framebuffer */
   enum elk_sometimes multisample_fbo:2;

   enum elk_sometimes line_aa:2;

   bool coherent_fb_fetch:1;
   bool ignore_sample_mask_out:1;

   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   uint64_t padding:56;
};
520
/** The program key for Compute Shaders (no CS-specific state beyond base). */
struct elk_cs_prog_key {
   struct elk_base_prog_key base;
};

/** The program key for the Gfx4-5 fixed-function GS (SOL/ff) program. */
struct elk_ff_gs_prog_key {
   /** Bitfield of VUE attributes present. */
   uint64_t attrs;

   /**
    * Map from the index of a transform feedback binding table entry to the
    * gl_varying_slot that should be streamed out through that binding table
    * entry.
    */
   unsigned char transform_feedback_bindings[ELK_MAX_SOL_BINDINGS];

   /**
    * Map from the index of a transform feedback binding table entry to the
    * swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[ELK_MAX_SOL_BINDINGS];

   /**
    * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST.
    */
   unsigned primitive:8;

   /** Provoking vertex is the first vertex (flat-shading convention). */
   unsigned pv_first:1;
   unsigned need_gs_prog:1;

   /**
    * Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-ELK_MAX_SOL_BINDINGS */
   /* Explicit padding so memcmp on the key sees fully initialized bytes. */
   uint64_t padding:47;
};

/* elk_any_prog_key is any of the keys that map to an API stage */
union elk_any_prog_key {
   struct elk_base_prog_key base;
   struct elk_vs_prog_key vs;
   struct elk_tcs_prog_key tcs;
   struct elk_tes_prog_key tes;
   struct elk_gs_prog_key gs;
   struct elk_wm_prog_key wm;
   struct elk_cs_prog_key cs;
};
567
568 PRAGMA_DIAGNOSTIC_POP
569
570 /** Max number of render targets in a shader */
571 #define ELK_MAX_DRAW_BUFFERS 8
572
573 /**
574 * Binding table index for the first gfx6 SOL binding.
575 */
576 #define ELK_GFX6_SOL_BINDING_START 0
577
/* A contiguous slice of a UBO that has been promoted to push constants. */
struct elk_ubo_range
{
   /** Index of the uniform block this range reads from. */
   uint16_t block;

   /* In units of 32-byte registers */
   uint8_t start;
   uint8_t length;
};
586
/* We reserve the first 2^16 values for builtins */
#define ELK_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

/**
 * Well-known ids for entries of elk_stage_prog_data::param.  Any param value
 * with the top 16 bits clear is one of these builtins; everything else is a
 * driver-defined parameter id.
 */
enum elk_param_builtin {
   ELK_PARAM_BUILTIN_ZERO,

   /* The eight user clip planes, four components each; keep this group
    * contiguous and X/Y/Z/W-ordered — the CLIP_PLANE macros below index
    * into it arithmetically.
    */
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_W,

   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   ELK_PARAM_BUILTIN_PATCH_VERTICES_IN,

   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X,
   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
   ELK_PARAM_BUILTIN_SUBGROUP_ID,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
   ELK_PARAM_BUILTIN_WORK_DIM,
};

/* Builtin id for component (comp) of user clip plane (idx). */
#define ELK_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (ELK_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define ELK_PARAM_BUILTIN_IS_CLIP_PLANE(param)  \
   ((param) >= ELK_PARAM_BUILTIN_CLIP_PLANE_0_X && \
    (param) <= ELK_PARAM_BUILTIN_CLIP_PLANE_7_W)

/* Inverse of ELK_PARAM_BUILTIN_CLIP_PLANE: recover the plane index... */
#define ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - ELK_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

/* ...and the component within that plane. */
#define ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - ELK_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)
657
/* Well-known relocation ids used by the drivers (see struct elk_shader_reloc). */
enum elk_shader_reloc_id {
   ELK_SHADER_RELOC_CONST_DATA_ADDR_LOW,
   ELK_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
   ELK_SHADER_RELOC_SHADER_START_OFFSET,
   ELK_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
};

enum elk_shader_reloc_type {
   /** An arbitrary 32-bit value */
   ELK_SHADER_RELOC_TYPE_U32,
   /** A MOV instruction with an immediate source */
   ELK_SHADER_RELOC_TYPE_MOV_IMM,
};
671
/** Represents a code relocation
 *
 * Relocatable constants are immediates in the code which we want to be able
 * to replace post-compile with the actual value.
 */
struct elk_shader_reloc {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** Type of this relocation */
   enum elk_shader_reloc_type type;

   /** The offset in the shader to the relocated value
    *
    * For MOV_IMM relocs, this is an offset to the MOV instruction.  This
    * allows us to do some sanity checking while we update the value.
    */
   uint32_t offset;

   /** Value to be added to the relocated value before it is written */
   uint32_t delta;
};

/** A value to write to a relocation
 *
 * Paired with struct elk_shader_reloc by id when the driver patches the
 * compiled binary.
 */
struct elk_shader_reloc_value {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** The value with which to replace the relocated immediate */
   uint32_t value;
};
703
/* Compiled-program metadata common to every shader stage.  Embedded as the
 * first member of each stage-specific prog_data struct.
 */
struct elk_stage_prog_data {
   /** UBO ranges promoted to push constants (see struct elk_ubo_range). */
   struct elk_ubo_range ubo_ranges[4];

   unsigned nr_params;       /**< number of float params/constants */

   /** Which API stage this program was compiled for. */
   gl_shader_stage stage;

   /* zero_push_reg is a bitfield which indicates what push registers (if any)
    * should be zeroed by SW at the start of the shader.  The corresponding
    * push_reg_mask_param specifies the param index (in 32-bit units) where
    * the actual runtime 64-bit mask will be pushed.  The shader will zero
    * push reg i if
    *
    *   reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
    *
    * If this field is set, elk_compiler::compact_params must be false.
    */
   uint64_t zero_push_reg;
   unsigned push_reg_mask_param;

   /* NOTE(review): believed to be in units of hardware registers — confirm
    * against the state-upload code.
    */
   unsigned curb_read_length;
   /** Per-thread scratch (spill) space required, in bytes. */
   unsigned total_scratch;
   /** Shared local memory required (compute), in bytes. */
   unsigned total_shared;

   /** Size of the compiled program, in bytes. */
   unsigned program_size;

   /* Size and offset of the constant data section appended to the program. */
   unsigned const_data_size;
   unsigned const_data_offset;

   /** Relocations the driver must patch before upload (see elk_shader_reloc). */
   unsigned num_relocs;
   const struct elk_shader_reloc *relocs;

   /** Does this program pull from any UBO or other constant buffers? */
   bool has_ubo_pull;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */

   /* 32-bit identifiers for all push/pull parameters.  These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them.  The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the elk_param_builtin enum defined
    * above.
    */
   uint32_t *param;

   /* Whether shader uses atomic operations. */
   bool uses_atomic_load_store;
};
758
759 static inline uint32_t *
elk_stage_prog_data_add_params(struct elk_stage_prog_data * prog_data,unsigned nr_new_params)760 elk_stage_prog_data_add_params(struct elk_stage_prog_data *prog_data,
761 unsigned nr_new_params)
762 {
763 unsigned old_nr_params = prog_data->nr_params;
764 prog_data->nr_params += nr_new_params;
765 prog_data->param = reralloc(ralloc_parent(prog_data->param),
766 prog_data->param, uint32_t,
767 prog_data->nr_params);
768 return prog_data->param + old_nr_params;
769 }
770
/* The barycentric-coordinate sets the hardware can deliver in the FS thread
 * payload: perspective-correct or non-perspective ("linear"), each evaluated
 * at the pixel center, the centroid, or per sample.
 */
enum elk_barycentric_mode {
   ELK_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   ELK_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   ELK_BARYCENTRIC_MODE_COUNT              = 6
};
/* Bitmask helpers over the enum above, for testing barycentric_interp_modes. */
#define ELK_BARYCENTRIC_PERSPECTIVE_BITS \
   ((1 << ELK_BARYCENTRIC_PERSPECTIVE_PIXEL) | \
    (1 << ELK_BARYCENTRIC_PERSPECTIVE_CENTROID) | \
    (1 << ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE))
#define ELK_BARYCENTRIC_NONPERSPECTIVE_BITS \
   ((1 << ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
    (1 << ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
    (1 << ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))

/* What the pixel shader declares about the depth value it computes; used to
 * program the early/late depth-test hardware.
 */
enum elk_pixel_shader_computed_depth_mode {
   ELK_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   ELK_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   ELK_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   ELK_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};
795
/* Data about a particular attempt to compile a program.  Note that
 * there can be many of these, each in a different GL state
 * corresponding to a different elk_wm_prog_key struct, with different
 * compiled programs.
 */
struct elk_wm_prog_data {
   struct elk_stage_prog_data base;

   unsigned num_per_primitive_inputs;
   unsigned num_varying_inputs;

   /* Register-block counts per dispatch width, for hardware state setup. */
   uint8_t reg_blocks_8;
   uint8_t reg_blocks_16;
   uint8_t reg_blocks_32;

   /* Payload start GRFs for the SIMD16/SIMD32 variants; the SIMD8 value
    * lives in base.dispatch_grf_start_reg.
    */
   uint8_t dispatch_grf_start_reg_16;
   uint8_t dispatch_grf_start_reg_32;
   /* Byte offsets of the SIMD16/SIMD32 kernels within the program; the
    * SIMD8 kernel is at offset 0 (see _elk_wm_prog_data_prog_offset).
    */
   uint32_t prog_offset_16;
   uint32_t prog_offset_32;

   struct {
      /** @{
       * surface indices the WM-specific surfaces
       */
      uint32_t render_target_read_start;
      /** @} */
   } binding_table;

   uint8_t color_outputs_written;
   /** An elk_pixel_shader_computed_depth_mode value. */
   uint8_t computed_depth_mode;

   bool computed_stencil;
   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   /* Which dispatch widths were compiled for this shader. */
   bool dispatch_8;
   bool dispatch_16;
   bool dispatch_32;
   bool dual_src_blend;
   /* Payload/feature usage flags consumed by hardware state setup. */
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_sample_mask;
   bool uses_vmask;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /** True if the shader wants sample shading
    *
    * This corresponds to whether or not a gl_SampleId, gl_SamplePosition, or
    * a sample-qualified input are used in the shader.  It is independent of
    * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan.
    */
   bool sample_shading;

   /** Should this shader be dispatched per-sample */
   enum elk_sometimes persample_dispatch;

   /**
    * Shader writes the SampleMask and this is AND-ed with the API's
    * SampleMask to generate a new coverage mask.
    */
   enum elk_sometimes alpha_to_coverage;

   /** Param index where the dynamic intel_msaa_flags value is pushed. */
   unsigned msaa_flags_param;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Those interpolations are delivered as part of the thread payload.  Used
    * in hardware setup on gfx6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Whether nonperspective interpolation modes are used by the
    * barycentric_interp_modes or fragment shader through interpolator messages.
    */
   bool uses_nonperspective_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source.  This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /**
    * The FS inputs
    */
   uint64_t inputs;

   /* Mapping of VUE slots to interpolation modes.
    * Used by the Gfx4-5 clip/sf/wm stages.
    */
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];
   int urb_setup_channel[VARYING_SLOT_MAX];

   /**
    * Cache structure into the urb_setup array above that contains the
    * attribute numbers of active varyings out of urb_setup.
    * The actual count is stored in urb_setup_attribs_count.
    */
   uint8_t urb_setup_attribs[VARYING_SLOT_MAX];
   uint8_t urb_setup_attribs_count;
};
912
913 #ifdef GFX_VERx10
914
/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) maps enabled dispatch widths to kernel
 * start pointer (KSP) indices; this function is, effectively, the reverse
 * mapping.
 *
 * Returns 8, 16, or 32 when the KSP index is valid for the given
 * SIMD8/16/32 enables, and 0 when it is not.  Contiguous dispatch is
 * deliberately ignored.
 */
static inline unsigned
elk_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
                          bool simd16_enabled, bool simd32_enabled)
{
   switch (ksp_idx) {
   case 0:
      /* KSP 0 holds SIMD8 when present; otherwise whichever single one of
       * SIMD16/SIMD32 is enabled (if both are, they use KSP 2/1 instead).
       */
      if (simd8_enabled)
         return 8;
      if (simd16_enabled && !simd32_enabled)
         return 16;
      if (simd32_enabled && !simd16_enabled)
         return 32;
      return 0;
   case 1:
      /* KSP 1 is SIMD32, but only when it shares the state with another
       * enabled width.
       */
      if (simd32_enabled && (simd16_enabled || simd8_enabled))
         return 32;
      return 0;
   case 2:
      /* KSP 2 is SIMD16, but only when it shares the state with another
       * enabled width.
       */
      if (simd16_enabled && (simd32_enabled || simd8_enabled))
         return 16;
      return 0;
   default:
      unreachable("Invalid KSP index");
   }
}

#define elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   elk_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
                             (wm_state)._16PixelDispatchEnable, \
                             (wm_state)._32PixelDispatchEnable)
948
949 #endif
950
951 #define elk_wm_state_has_ksp(wm_state, ksp_idx) \
952 (elk_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)
953
954 static inline uint32_t
_elk_wm_prog_data_prog_offset(const struct elk_wm_prog_data * prog_data,unsigned simd_width)955 _elk_wm_prog_data_prog_offset(const struct elk_wm_prog_data *prog_data,
956 unsigned simd_width)
957 {
958 switch (simd_width) {
959 case 8: return 0;
960 case 16: return prog_data->prog_offset_16;
961 case 32: return prog_data->prog_offset_32;
962 default: return 0;
963 }
964 }
965
/* Program offset of the FS variant dispatched from the given KSP index. */
#define elk_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_prog_offset(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
969
970 static inline uint8_t
_elk_wm_prog_data_dispatch_grf_start_reg(const struct elk_wm_prog_data * prog_data,unsigned simd_width)971 _elk_wm_prog_data_dispatch_grf_start_reg(const struct elk_wm_prog_data *prog_data,
972 unsigned simd_width)
973 {
974 switch (simd_width) {
975 case 8: return prog_data->base.dispatch_grf_start_reg;
976 case 16: return prog_data->dispatch_grf_start_reg_16;
977 case 32: return prog_data->dispatch_grf_start_reg_32;
978 default: return 0;
979 }
980 }
981
/* Dispatch GRF start register of the variant behind the given KSP index. */
#define elk_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_dispatch_grf_start_reg(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
985
986 static inline uint8_t
_elk_wm_prog_data_reg_blocks(const struct elk_wm_prog_data * prog_data,unsigned simd_width)987 _elk_wm_prog_data_reg_blocks(const struct elk_wm_prog_data *prog_data,
988 unsigned simd_width)
989 {
990 switch (simd_width) {
991 case 8: return prog_data->reg_blocks_8;
992 case 16: return prog_data->reg_blocks_16;
993 case 32: return prog_data->reg_blocks_32;
994 default: return 0;
995 }
996 }
997
/* Register-block count of the variant behind the given KSP index. */
#define elk_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_reg_blocks(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
1001
1002 static inline bool
elk_wm_prog_data_is_persample(const struct elk_wm_prog_data * prog_data,enum intel_msaa_flags pushed_msaa_flags)1003 elk_wm_prog_data_is_persample(const struct elk_wm_prog_data *prog_data,
1004 enum intel_msaa_flags pushed_msaa_flags)
1005 {
1006 if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) {
1007 if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO))
1008 return false;
1009
1010 if (prog_data->sample_shading)
1011 assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
1012
1013 if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)
1014 assert(prog_data->persample_dispatch != ELK_NEVER);
1015 else
1016 assert(prog_data->persample_dispatch != ELK_ALWAYS);
1017
1018 return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0;
1019 }
1020
1021 assert(prog_data->persample_dispatch == ELK_ALWAYS ||
1022 prog_data->persample_dispatch == ELK_NEVER);
1023
1024 return prog_data->persample_dispatch;
1025 }
1026
1027 static inline uint32_t
elk_wm_prog_data_barycentric_modes(const struct elk_wm_prog_data * prog_data,enum intel_msaa_flags pushed_msaa_flags)1028 elk_wm_prog_data_barycentric_modes(const struct elk_wm_prog_data *prog_data,
1029 enum intel_msaa_flags pushed_msaa_flags)
1030 {
1031 uint32_t modes = prog_data->barycentric_interp_modes;
1032
1033 /* In the non dynamic case, we can just return the computed modes from
1034 * compilation time.
1035 */
1036 if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC))
1037 return modes;
1038
1039 if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) {
1040 assert(prog_data->persample_dispatch == ELK_ALWAYS ||
1041 (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH));
1042
1043 /* Making dynamic per-sample interpolation work is a bit tricky. The
1044 * hardware will hang if SAMPLE is requested but per-sample dispatch is
1045 * not enabled. This means we can't preemptively add SAMPLE to the
1046 * barycentrics bitfield. Instead, we have to add it late and only
1047 * on-demand. Annoyingly, changing the number of barycentrics requested
1048 * changes the whole PS shader payload so we very much don't want to do
1049 * that. Instead, if the dynamic per-sample interpolation flag is set,
1050 * we check to see if SAMPLE was requested and, if not, replace the
1051 * highest barycentric bit in the [non]perspective grouping (CENTROID,
1052 * if it exists, else PIXEL) with SAMPLE. The shader will stomp all the
1053 * barycentrics in the shader with SAMPLE so it really doesn't matter
1054 * which one we replace. The important thing is that we keep the number
1055 * of barycentrics in each [non]perspective grouping the same.
1056 */
1057 if ((modes & ELK_BARYCENTRIC_PERSPECTIVE_BITS) &&
1058 !(modes & BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE))) {
1059 int sample_mode =
1060 util_last_bit(modes & ELK_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
1061 assert(modes & BITFIELD_BIT(sample_mode));
1062
1063 modes &= ~BITFIELD_BIT(sample_mode);
1064 modes |= BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
1065 }
1066
1067 if ((modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) &&
1068 !(modes & BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) {
1069 int sample_mode =
1070 util_last_bit(modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
1071 assert(modes & BITFIELD_BIT(sample_mode));
1072
1073 modes &= ~BITFIELD_BIT(sample_mode);
1074 modes |= BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE);
1075 }
1076 } else {
1077 /* If we're not using per-sample interpolation, we need to disable the
1078 * per-sample bits.
1079 *
1080 * SKL PRMs, Volume 2a: Command Reference: Instructions,
1081 * 3DSTATE_WM:Barycentric Interpolation Mode:
1082
1083 * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
1084 * Sample or Non-perspective Sample barycentric coordinates."
1085 */
1086 modes &= ~(BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
1087 BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
1088 }
1089
1090 return modes;
1091 }
1092
/* Size of one push-constant block, tracked at three granularities. */
struct elk_push_const_block {
   unsigned dwords; /* Dword count, not reg aligned */
   unsigned regs;   /* Register count — presumably dwords rounded up to full
                     * GRFs; TODO confirm against the writer of this struct.
                     */
   unsigned size;   /* Bytes, register aligned */
};
1098
/* Compile results for a compute shader. */
struct elk_cs_prog_data {
   struct elk_stage_prog_data base;

   /* Compile-time workgroup size; may be overridden at dispatch time for
    * variable group sizes (see elk_cs_get_dispatch_info()).
    */
   unsigned local_size[3];

   /* Program offsets for the 8/16/32 SIMD variants. Multiple variants are
    * kept when using variable group size, and the right one can only be
    * decided at dispatch time.
    */
   unsigned prog_offset[3];

   /* Bitmask indicating which program offsets are valid. */
   unsigned prog_mask;

   /* Bitmask indicating which programs have spilled. */
   unsigned prog_spilled;

   /* True if the shader uses a workgroup barrier. */
   bool uses_barrier;
   /* True if the shader reads gl_NumWorkGroups (needs the surface below). */
   bool uses_num_work_groups;

   /* Push-constant layout, split into the cross-thread part (shared by all
    * threads) and the per-thread part.
    */
   struct {
      struct elk_push_const_block cross_thread;
      struct elk_push_const_block per_thread;
   } push;

   struct {
      /** @{
       * surface indices the CS-specific surfaces
       */
      uint32_t work_groups_start;
      /** @} */
   } binding_table;
};
1132
1133 static inline uint32_t
elk_cs_prog_data_prog_offset(const struct elk_cs_prog_data * prog_data,unsigned dispatch_width)1134 elk_cs_prog_data_prog_offset(const struct elk_cs_prog_data *prog_data,
1135 unsigned dispatch_width)
1136 {
1137 assert(dispatch_width == 8 ||
1138 dispatch_width == 16 ||
1139 dispatch_width == 32);
1140 const unsigned index = dispatch_width / 16;
1141 assert(prog_data->prog_mask & (1 << index));
1142 return prog_data->prog_offset[index];
1143 }
1144
/* Compile results for the fixed-function geometry shader (gfx4-6). */
struct elk_ff_gs_prog_data {
   /* Amount of URB data read per vertex (hardware units). */
   unsigned urb_read_length;
   /* Total number of GRF registers used by the program. */
   unsigned total_grf;

   /**
    * Gfx6 transform feedback: Amount by which the streaming vertex buffer
    * indices should be incremented each time the GS is invoked.
    */
   unsigned svbi_postincrement_value;
};
1155
1156 /**
1157 * Enum representing the i965-specific vertex results that don't correspond
1158 * exactly to any element of gl_varying_slot. The values of this enum are
1159 * assigned such that they don't conflict with gl_varying_slot.
1160 */
typedef enum
{
   /* Gfx4-5 normalized device coordinates, computed by the VS. */
   ELK_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
   /* Explicit padding slot, skipped by consumers of the VUE map. */
   ELK_VARYING_SLOT_PAD,
   /**
    * Technically this is not a varying but just a placeholder that
    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
    * builtin variable to be compiled correctly. see compile_sf_prog() for
    * more info.
    */
   ELK_VARYING_SLOT_PNTC,
   ELK_VARYING_SLOT_COUNT
} elk_varying_slot;
1174
1175 /**
1176 * We always program SF to start reading at an offset of 1 (2 varying slots)
1177 * from the start of the vertex URB entry. This causes it to skip:
1178 * - VARYING_SLOT_PSIZ and ELK_VARYING_SLOT_NDC on gfx4-5
1179 * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+
1180 */
1181 #define ELK_SF_URB_ENTRY_READ_OFFSET 1
1182
1183 /**
1184 * Bitmask indicating which fragment shader inputs represent varyings (and
1185 * hence have to be delivered to the fragment shader by the SF/SBE stage).
1186 */
1187 #define ELK_FS_VARYING_INPUT_MASK \
1188 (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
1189 ~VARYING_BIT_POS & ~VARYING_BIT_FACE)
1190
1191 void elk_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
1192 gl_shader_stage stage);
1193
1194 /**
1195 * Convert a VUE slot number into a byte offset within the VUE.
1196 */
elk_vue_slot_to_offset(unsigned slot)1197 static inline unsigned elk_vue_slot_to_offset(unsigned slot)
1198 {
1199 return 16*slot;
1200 }
1201
1202 /**
1203 * Convert a vertex output (elk_varying_slot) into a byte offset within the
1204 * VUE.
1205 */
1206 static inline unsigned
elk_varying_to_offset(const struct intel_vue_map * vue_map,unsigned varying)1207 elk_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
1208 {
1209 return elk_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
1210 }
1211
1212 void elk_compute_vue_map(const struct intel_device_info *devinfo,
1213 struct intel_vue_map *vue_map,
1214 uint64_t slots_valid,
1215 bool separate_shader,
1216 uint32_t pos_slots);
1217
1218 void elk_compute_tess_vue_map(struct intel_vue_map *const vue_map,
1219 uint64_t slots_valid,
1220 uint32_t is_patch);
1221
1222 /* elk_interpolation_map.c */
1223 void elk_setup_vue_interpolation(const struct intel_vue_map *vue_map,
1224 struct nir_shader *nir,
1225 struct elk_wm_prog_data *prog_data);
1226
/* Common compile results for all VUE-producing stages (VS/TCS/TES/GS). */
struct elk_vue_prog_data {
   struct elk_stage_prog_data base;
   struct intel_vue_map vue_map;

   /** Should the hardware deliver input VUE handles for URB pull loads? */
   bool include_vue_handles;

   /* URB entry read length programmed into the thread payload (hardware
    * units).
    */
   unsigned urb_read_length;
   /* Total number of GRF registers used by the program. */
   unsigned total_grf;

   /* Bitmasks of enabled clip/cull distances written by the shader. */
   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;

   /* Used for calculating urb partitions. In the VS, this is the size of the
    * URB entry used for both input and output to the thread. In the GS, this
    * is the size of the URB entry used for output.
    */
   unsigned urb_entry_size;

   /* SIMD4x2 vs SIMD8 style thread dispatch for this stage. */
   enum intel_shader_dispatch_mode dispatch_mode;
};
1248
/* Compile results for a vertex shader. */
struct elk_vs_prog_data {
   struct elk_vue_prog_data base;

   /* Bitmask of vertex inputs read by the shader. */
   uint64_t inputs_read;
   /* Inputs read as 64-bit (double) attributes — they consume extra slots. */
   uint64_t double_inputs_read;

   /* Number of input attribute slots the vertex fetcher must deliver. */
   unsigned nr_attribute_slots;

   /* System values referenced by the shader; the driver uses these to set
    * up vertex-element/push data accordingly.
    */
   bool uses_vertexid;
   bool uses_instanceid;
   bool uses_is_indexed_draw;
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
};
1264
/* Compile results for a tessellation control shader. */
struct elk_tcs_prog_data
{
   struct elk_vue_prog_data base;

   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

   /** Number of vertices in the output patch */
   int instances;

   /** Track patch count threshold */
   int patch_count_threshold;
};
1278
1279
/* Compile results for a tessellation evaluation shader. */
struct elk_tes_prog_data
{
   struct elk_vue_prog_data base;

   /* Tessellator configuration derived from the shader's layout qualifiers. */
   enum intel_tess_partitioning partitioning;
   enum intel_tess_output_topology output_topology;
   enum intel_tess_domain domain;
   /* Should the payload provide primitive ID? */
   bool include_primitive_id;
};
1289
/* Compile results for a geometry shader. */
struct elk_gs_prog_data
{
   struct elk_vue_prog_data base;

   /* Number of input vertices per primitive (from the GS input layout). */
   unsigned vertices_in;

   /**
    * Size of an output vertex, measured in HWORDS (32 bytes).
    */
   unsigned output_vertex_size_hwords;

   /* Hardware output topology of the emitted primitives. */
   unsigned output_topology;

   /**
    * Size of the control data (cut bits or StreamID bits), in hwords (32
    * bytes). 0 if there is no control data.
    */
   unsigned control_data_header_size_hwords;

   /**
    * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
    * if the control data is StreamID bits, or
    * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
    * Ignored if control_data_header_size is 0.
    */
   unsigned control_data_format;

   /* Should the payload provide primitive ID? */
   bool include_primitive_id;

   /**
    * The number of vertices emitted, if constant - otherwise -1.
    */
   int static_vertex_count;

   /* GS invocation count (instanced GS). */
   int invocations;

   /**
    * Gfx6: Provoking vertex convention for odd-numbered triangles
    * in tristrips.
    */
   unsigned pv_first:1;

   /**
    * Gfx6: Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-ELK_MAX_SOL_BINDINGS */

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to the
    * gl_varying_slot that should be streamed out through that binding table
    * entry.
    */
   unsigned char transform_feedback_bindings[64 /* ELK_MAX_SOL_BINDINGS */];

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to the
    * swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[64 /* ELK_MAX_SOL_BINDINGS */];
};
1351
/* Compile results for the fixed-function strips-and-fans (SF) shader. */
struct elk_sf_prog_data {
   /* Amount of URB data read per vertex (hardware units). */
   uint32_t urb_read_length;
   /* Total number of GRF registers used by the program. */
   uint32_t total_grf;

   /* Each vertex may have up to 12 attributes, 4 components each,
    * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11
    * rows.
    *
    * Actually we use 4 for each, so call it 12 rows.
    */
   unsigned urb_entry_size;
};
1364
/* Compile results for the fixed-function clipper shader. */
struct elk_clip_prog_data {
   uint32_t curb_read_length; /* user planes? */
   /* Clip mode programmed into the clipper state. */
   uint32_t clip_mode;
   /* Amount of URB data read per vertex (hardware units). */
   uint32_t urb_read_length;
   /* Total number of GRF registers used by the program. */
   uint32_t total_grf;
};
1371
1372 /* elk_any_prog_data is prog_data for any stage that maps to an API stage */
union elk_any_prog_data {
   /* Every member begins with elk_stage_prog_data, so `base` safely aliases
    * the common header of whichever stage-specific member is active.
    */
   struct elk_stage_prog_data base;
   struct elk_vue_prog_data vue;
   struct elk_vs_prog_data vs;
   struct elk_tcs_prog_data tcs;
   struct elk_tes_prog_data tes;
   struct elk_gs_prog_data gs;
   struct elk_wm_prog_data wm;
   struct elk_cs_prog_data cs;
};
1383
/* Generates checked downcast helpers from the generic elk_stage_prog_data
 * to a stage-specific prog_data struct.  CHECK is asserted (debug builds
 * only) whenever a non-NULL pointer is converted; NULL passes through.
 */
#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK)                            \
static inline struct elk_##STAGE##_prog_data *                             \
elk_##STAGE##_prog_data(struct elk_stage_prog_data *prog_data)             \
{                                                                          \
   if (prog_data)                                                          \
      assert(CHECK);                                                       \
   return (struct elk_##STAGE##_prog_data *) prog_data;                    \
}                                                                          \
static inline const struct elk_##STAGE##_prog_data *                       \
elk_##STAGE##_prog_data_const(const struct elk_stage_prog_data *prog_data) \
{                                                                          \
   if (prog_data)                                                          \
      assert(CHECK);                                                       \
   return (const struct elk_##STAGE##_prog_data *) prog_data;              \
}

DEFINE_PROG_DATA_DOWNCAST(vs,  prog_data->stage == MESA_SHADER_VERTEX)
DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL)
DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL)
DEFINE_PROG_DATA_DOWNCAST(gs,  prog_data->stage == MESA_SHADER_GEOMETRY)
DEFINE_PROG_DATA_DOWNCAST(wm,  prog_data->stage == MESA_SHADER_FRAGMENT)
DEFINE_PROG_DATA_DOWNCAST(cs,  gl_shader_stage_uses_workgroup(prog_data->stage))

/* A VUE prog_data may come from any of the geometry pipeline stages. */
DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX ||
                               prog_data->stage == MESA_SHADER_TESS_CTRL ||
                               prog_data->stage == MESA_SHADER_TESS_EVAL ||
                               prog_data->stage == MESA_SHADER_GEOMETRY)

/* These are not really elk_stage_prog_data. */
DEFINE_PROG_DATA_DOWNCAST(ff_gs, true)
DEFINE_PROG_DATA_DOWNCAST(clip, true)
DEFINE_PROG_DATA_DOWNCAST(sf, true)
#undef DEFINE_PROG_DATA_DOWNCAST
1417
/* Per-variant statistics reported after compilation (for INTEL_DEBUG and
 * VK_KHR_pipeline_executable_properties style reporting).
 */
struct elk_compile_stats {
   uint32_t dispatch_width; /**< 0 for vec4 */
   uint32_t max_polygons;       /* polygons handled per FS thread */
   uint32_t max_dispatch_width; /* widest SIMD width the variant supports */
   uint32_t instructions;       /* final instruction count */
   uint32_t sends;              /* number of send (message) instructions */
   uint32_t loops;              /* number of loops in the program */
   uint32_t cycles;             /* estimated cycle count — presumably from the
                                 * scheduler model; TODO confirm */
   uint32_t spills;             /* register spill store count */
   uint32_t fills;              /* register spill load count */
   uint32_t max_live_registers; /* peak register pressure */
};
1430
1431 /** @} */
1432
1433 struct elk_compiler *
1434 elk_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo);
1435
1436 /**
1437 * Returns a compiler configuration for use with disk shader cache
1438 *
1439 * This value only needs to change for settings that can cause different
1440 * program generation between two runs on the same hardware.
1441 *
1442 * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
1443 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
1444 */
1445 uint64_t
1446 elk_get_compiler_config_value(const struct elk_compiler *compiler);
1447
1448 unsigned
1449 elk_prog_data_size(gl_shader_stage stage);
1450
1451 unsigned
1452 elk_prog_key_size(gl_shader_stage stage);
1453
/* Parameters common to every elk_compile_* entry point. */
struct elk_compile_params {
   /* Ralloc memory context the returned assembly is allocated from; the
    * caller owns it and frees everything by freeing the context.
    */
   void *mem_ctx;

   /* Shader to compile; may be modified in place by compiler passes. */
   nir_shader *nir;

   /* Optional output array of per-variant statistics (may be NULL). */
   struct elk_compile_stats *stats;

   /* Opaque pointer handed back to the driver's compiler-log callbacks. */
   void *log_data;

   /* On failure, receives a human-readable error string — presumably
    * allocated out of mem_ctx; TODO confirm ownership.
    */
   char *error_str;

   /* Debug flag (INTEL_DEBUG bit) that enables shader dumping for this
    * stage.
    */
   uint64_t debug_flag;

   /* Hash of the shader source, used for debug naming/caching. */
   uint32_t source_hash;
};
1469
1470 /**
1471 * Parameters for compiling a vertex shader.
1472 *
1473 * Some of these will be modified during the shader compilation.
1474 */
struct elk_compile_vs_params {
   struct elk_compile_params base;

   const struct elk_vs_prog_key *key;  /* in: state-based shader key */
   struct elk_vs_prog_data *prog_data; /* out: filled in by the compiler */

   bool edgeflag_is_last; /* true for gallium */
};
1483
1484 /**
1485 * Compile a vertex shader.
1486 *
1487 * Returns the final assembly and updates the parameters structure.
1488 */
1489 const unsigned *
1490 elk_compile_vs(const struct elk_compiler *compiler,
1491 struct elk_compile_vs_params *params);
1492
1493 /**
1494 * Parameters for compiling a tessellation control shader.
1495 *
1496 * Some of these will be modified during the shader compilation.
1497 */
struct elk_compile_tcs_params {
   struct elk_compile_params base;

   const struct elk_tcs_prog_key *key;  /* in: state-based shader key */
   struct elk_tcs_prog_data *prog_data; /* out: filled in by the compiler */
};
1504
1505 /**
1506 * Compile a tessellation control shader.
1507 *
1508 * Returns the final assembly and updates the parameters structure.
1509 */
1510 const unsigned *
1511 elk_compile_tcs(const struct elk_compiler *compiler,
1512 struct elk_compile_tcs_params *params);
1513
1514 /**
1515 * Parameters for compiling a tessellation evaluation shader.
1516 *
1517 * Some of these will be modified during the shader compilation.
1518 */
struct elk_compile_tes_params {
   struct elk_compile_params base;

   const struct elk_tes_prog_key *key;  /* in: state-based shader key */
   struct elk_tes_prog_data *prog_data; /* out: filled in by the compiler */
   /* Layout of the URB entries produced by the TCS feeding this TES. */
   const struct intel_vue_map *input_vue_map;
};
1526
1527 /**
1528 * Compile a tessellation evaluation shader.
1529 *
1530 * Returns the final assembly and updates the parameters structure.
1531 */
1532 const unsigned *
1533 elk_compile_tes(const struct elk_compiler *compiler,
1534 struct elk_compile_tes_params *params);
1535
1536 /**
1537 * Parameters for compiling a geometry shader.
1538 *
1539 * Some of these will be modified during the shader compilation.
1540 */
struct elk_compile_gs_params {
   struct elk_compile_params base;

   const struct elk_gs_prog_key *key;  /* in: state-based shader key */
   struct elk_gs_prog_data *prog_data; /* out: filled in by the compiler */
};
1547
1548 /**
1549 * Compile a geometry shader.
1550 *
1551 * Returns the final assembly and updates the parameters structure.
1552 */
1553 const unsigned *
1554 elk_compile_gs(const struct elk_compiler *compiler,
1555 struct elk_compile_gs_params *params);
1556
1557 /**
1558 * Compile a strips and fans shader.
1559 *
1560 * This is a fixed-function shader determined entirely by the shader key and
1561 * a VUE map.
1562 *
1563 * Returns the final assembly and the program's size.
1564 */
1565 const unsigned *
1566 elk_compile_sf(const struct elk_compiler *compiler,
1567 void *mem_ctx,
1568 const struct elk_sf_prog_key *key,
1569 struct elk_sf_prog_data *prog_data,
1570 struct intel_vue_map *vue_map,
1571 unsigned *final_assembly_size);
1572
1573 /**
1574 * Compile a clipper shader.
1575 *
1576 * This is a fixed-function shader determined entirely by the shader key and
1577 * a VUE map.
1578 *
1579 * Returns the final assembly and the program's size.
1580 */
1581 const unsigned *
1582 elk_compile_clip(const struct elk_compiler *compiler,
1583 void *mem_ctx,
1584 const struct elk_clip_prog_key *key,
1585 struct elk_clip_prog_data *prog_data,
1586 struct intel_vue_map *vue_map,
1587 unsigned *final_assembly_size);
1588
1589 /**
1590 * Parameters for compiling a fragment shader.
1591 *
1592 * Some of these will be modified during the shader compilation.
1593 */
struct elk_compile_fs_params {
   struct elk_compile_params base;

   const struct elk_wm_prog_key *key;  /* in: state-based shader key */
   struct elk_wm_prog_data *prog_data; /* out: filled in by the compiler */

   /* Layout of the URB entries delivered to the FS (may be NULL —
    * presumably when the inputs are described some other way; TODO confirm).
    */
   const struct intel_vue_map *vue_map;
   const struct elk_mue_map *mue_map;

   bool allow_spilling;  /* permit register spills to scratch space */
   bool use_rep_send;    /* use the replicated-data FB write path */
   uint8_t max_polygons; /* max polygons to process per FS thread */
};
1607
1608 /**
1609 * Compile a fragment shader.
1610 *
1611 * Returns the final assembly and updates the parameters structure.
1612 */
1613 const unsigned *
1614 elk_compile_fs(const struct elk_compiler *compiler,
1615 struct elk_compile_fs_params *params);
1616
1617 /**
1618 * Parameters for compiling a compute shader.
1619 *
1620 * Some of these will be modified during the shader compilation.
1621 */
struct elk_compile_cs_params {
   struct elk_compile_params base;

   const struct elk_cs_prog_key *key;  /* in: state-based shader key */
   struct elk_cs_prog_data *prog_data; /* out: filled in by the compiler */
};
1628
1629 /**
1630 * Compile a compute shader.
1631 *
1632 * Returns the final assembly and updates the parameters structure.
1633 */
1634 const unsigned *
1635 elk_compile_cs(const struct elk_compiler *compiler,
1636 struct elk_compile_cs_params *params);
1637
1638 /**
1639 * Compile a fixed function geometry shader.
1640 *
1641 * Returns the final assembly and the program's size.
1642 */
1643 const unsigned *
1644 elk_compile_ff_gs_prog(struct elk_compiler *compiler,
1645 void *mem_ctx,
1646 const struct elk_ff_gs_prog_key *key,
1647 struct elk_ff_gs_prog_data *prog_data,
1648 struct intel_vue_map *vue_map,
1649 unsigned *final_assembly_size);
1650
1651 void elk_debug_key_recompile(const struct elk_compiler *c, void *log,
1652 gl_shader_stage stage,
1653 const struct elk_base_prog_key *old_key,
1654 const struct elk_base_prog_key *key);
1655
1656 unsigned
1657 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
1658 unsigned threads);
1659
1660 void
1661 elk_write_shader_relocs(const struct elk_isa_info *isa,
1662 void *program,
1663 const struct elk_stage_prog_data *prog_data,
1664 struct elk_shader_reloc_value *values,
1665 unsigned num_values);
1666
1667 /**
1668 * Get the dispatch information for a shader to be used with GPGPU_WALKER and
1669 * similar instructions.
1670 *
1671 * If override_local_size is not NULL, it must to point to a 3-element that
1672 * will override the value from prog_data->local_size. This is used by
1673 * ARB_compute_variable_group_size, where the size is set only at dispatch
1674 * time (so prog_data is outdated).
1675 */
1676 struct intel_cs_dispatch_info
1677 elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1678 const struct elk_cs_prog_data *prog_data,
1679 const unsigned *override_local_size);
1680
1681 /**
1682 * Return true if the given shader stage is dispatched contiguously by the
1683 * relevant fixed function starting from channel 0 of the SIMD thread, which
1684 * implies that the dispatch mask of a thread can be assumed to have the form
1685 * '2^n - 1' for some n.
1686 */
1687 static inline bool
elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info * devinfo,gl_shader_stage stage,const struct elk_stage_prog_data * prog_data)1688 elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
1689 gl_shader_stage stage,
1690 const struct elk_stage_prog_data *prog_data)
1691 {
1692 /* The code below makes assumptions about the hardware's thread dispatch
1693 * behavior that could be proven wrong in future generations -- Make sure
1694 * to do a full test run with elk_fs_test_dispatch_packing() hooked up to
1695 * the NIR front-end before changing this assertion.
1696 */
1697 assert(devinfo->ver <= 8);
1698
1699 switch (stage) {
1700 case MESA_SHADER_FRAGMENT: {
1701 /* The PSD discards subspans coming in with no lit samples, which in the
1702 * per-pixel shading case implies that each subspan will either be fully
1703 * lit (due to the VMask being used to allow derivative computations),
1704 * or not dispatched at all. In per-sample dispatch mode individual
1705 * samples from the same subspan have a fixed relative location within
1706 * the SIMD thread, so dispatch of unlit samples cannot be avoided in
1707 * general and we should return false.
1708 */
1709 const struct elk_wm_prog_data *wm_prog_data =
1710 (const struct elk_wm_prog_data *)prog_data;
1711 return !wm_prog_data->persample_dispatch &&
1712 wm_prog_data->uses_vmask;
1713 }
1714 case MESA_SHADER_COMPUTE:
1715 /* Compute shaders will be spawned with either a fully enabled dispatch
1716 * mask or with whatever bottom/right execution mask was given to the
1717 * GPGPU walker command to be used along the workgroup edges -- In both
1718 * cases the dispatch mask is required to be tightly packed for our
1719 * invocation index calculations to work.
1720 */
1721 return true;
1722 default:
1723 /* Most remaining fixed functions are limited to use a packed dispatch
1724 * mask due to the hardware representation of the dispatch mask as a
1725 * single counter representing the number of enabled channels.
1726 */
1727 return true;
1728 }
1729 }
1730
1731 /**
1732 * Computes the first varying slot in the URB produced by the previous stage
1733 * that is used in the next stage. We do this by testing the varying slots in
1734 * the previous stage's vue map against the inputs read in the next stage.
1735 *
1736 * Note that:
1737 *
1738 * - Each URB offset contains two varying slots and we can only skip a
1739 * full offset if both slots are unused, so the value we return here is always
1740 * rounded down to the closest multiple of two.
1741 *
1742 * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they are
1743 * part of the vue header, so if these are read we can't skip anything.
1744 */
1745 static inline int
elk_compute_first_urb_slot_required(uint64_t inputs_read,const struct intel_vue_map * prev_stage_vue_map)1746 elk_compute_first_urb_slot_required(uint64_t inputs_read,
1747 const struct intel_vue_map *prev_stage_vue_map)
1748 {
1749 if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) {
1750 for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
1751 int varying = prev_stage_vue_map->slot_to_varying[i];
1752 if (varying != ELK_VARYING_SLOT_PAD && varying > 0 &&
1753 varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0)
1754 return ROUND_DOWN_TO(i, 2);
1755 }
1756 }
1757
1758 return 0;
1759 }
1760
1761 #ifdef __cplusplus
1762 } /* extern "C" */
1763 #endif
1764