1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_nir.h"
25 #include "brw_shader.h"
26 #include "dev/gen_debug.h"
27 #include "compiler/glsl_types.h"
28 #include "compiler/nir/nir_builder.h"
29 #include "util/u_math.h"
30
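/* Remap loads/stores of gl_TessLevelInner/Outer components onto the patch
 * header layout the hardware expects.  The tess levels live in the first two
 * vec4 slots written for the patch (base 0 and 1); as implemented below, the
 * mapping is:
 *
 *   quads:     inner[0..1] -> base 0, DWords 3-2 (reversed)
 *              outer[0..3] -> base 1, DWords 7-4 (reversed)
 *   triangles: inner[0]    -> base 1, DWord 4
 *              outer[0..2] -> base 1, DWords 7-5 (reversed)
 *   isolines:  outer[0..1] -> base 1, DWords 6-7 (in order)
 *
 * Components that don't exist for the given domain are out of bounds: loads
 * are replaced with an undef and the instruction is removed.  Returns true
 * if the intrinsic was a tess level access.
 */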
31 static bool
32 remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
33 GLenum primitive_mode)
34 {
35 const int location = nir_intrinsic_base(intr);
36 const unsigned component = nir_intrinsic_component(intr);
37 bool out_of_bounds;
38
39 if (location == VARYING_SLOT_TESS_LEVEL_INNER) {
40 switch (primitive_mode) {
41 case GL_QUADS:
42 /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */
43 nir_intrinsic_set_base(intr, 0);
44 nir_intrinsic_set_component(intr, 3 - component);
45 out_of_bounds = false;
46 break;
47 case GL_TRIANGLES:
48 /* gl_TessLevelInner[0] lives at DWord 4. */
49 nir_intrinsic_set_base(intr, 1);
50 out_of_bounds = component > 0;
51 break;
52 case GL_ISOLINES:
53 out_of_bounds = true;
54 break;
55 default:
56 unreachable("Bogus tessellation domain");
57 }
58 } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) {
59 if (primitive_mode == GL_ISOLINES) {
60 /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */
61 nir_intrinsic_set_base(intr, 1);
62 nir_intrinsic_set_component(intr, 2 + nir_intrinsic_component(intr));
63 out_of_bounds = component > 1;
64 } else {
65 /* Triangles use DWords 7-5 (reversed); Quads use 7-4 (reversed) */
66 nir_intrinsic_set_base(intr, 1);
67 nir_intrinsic_set_component(intr, 3 - nir_intrinsic_component(intr));
68 out_of_bounds = component == 3 && primitive_mode == GL_TRIANGLES;
69 }
70 } else {
71 return false;
72 }
73
74 if (out_of_bounds) {
75 if (nir_intrinsic_infos[intr->intrinsic].has_dest) {
76 b->cursor = nir_before_instr(&intr->instr);
77 nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
78 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(undef));
79 }
80 nir_instr_remove(&intr->instr);
81 }
82
83 return true;
84 }
85
86 static bool
87 is_input(nir_intrinsic_instr *intrin)
88 {
89 return intrin->intrinsic == nir_intrinsic_load_input ||
90 intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
91 intrin->intrinsic == nir_intrinsic_load_interpolated_input;
92 }
93
94 static bool
95 is_output(nir_intrinsic_instr *intrin)
96 {
97 return intrin->intrinsic == nir_intrinsic_load_output ||
98 intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
99 intrin->intrinsic == nir_intrinsic_store_output ||
100 intrin->intrinsic == nir_intrinsic_store_per_vertex_output;
101 }
102
103
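/* Rewrite TCS output / TES input intrinsics from gl_varying_slot numbering
 * to VUE map slot numbering.  Tess level accesses are handled separately by
 * remap_tess_levels() above; that remap is skipped for the passthrough TCS,
 * which stores whole patch-header vec4s (see brw_nir_create_passthrough_tcs).
 * For per-vertex accesses, a constant vertex index is folded into the base;
 * a dynamic one becomes an imul/iadd on the indirect offset source.
 */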
104 static bool
105 remap_patch_urb_offsets(nir_block *block, nir_builder *b,
106 const struct brw_vue_map *vue_map,
107 GLenum tes_primitive_mode)
108 {
109 const bool is_passthrough_tcs = b->shader->info.name &&
110 strcmp(b->shader->info.name, "passthrough") == 0;
111
112 nir_foreach_instr_safe(instr, block) {
113 if (instr->type != nir_instr_type_intrinsic)
114 continue;
115
116 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
117
118 gl_shader_stage stage = b->shader->info.stage;
119
120 if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
121 (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
122
123 if (!is_passthrough_tcs &&
124 remap_tess_levels(b, intrin, tes_primitive_mode))
125 continue;
126
127 int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
128 assert(vue_slot != -1);
129 intrin->const_index[0] = vue_slot;
130
131 nir_src *vertex = nir_get_io_vertex_index_src(intrin);
132 if (vertex) {
133 if (nir_src_is_const(*vertex)) {
134 intrin->const_index[0] += nir_src_as_uint(*vertex) *
135 vue_map->num_per_vertex_slots;
136 } else {
137 b->cursor = nir_before_instr(&intrin->instr);
138
139 /* Multiply by the number of per-vertex slots. */
140 nir_ssa_def *vertex_offset =
141 nir_imul(b,
142 nir_ssa_for_src(b, *vertex, 1),
143 nir_imm_int(b,
144 vue_map->num_per_vertex_slots));
145
146 /* Add it to the existing offset */
147 nir_src *offset = nir_get_io_offset_src(intrin);
148 nir_ssa_def *total_offset =
149 nir_iadd(b, vertex_offset,
150 nir_ssa_for_src(b, *offset, 1));
151
152 nir_instr_rewrite_src(&intrin->instr, offset,
153 nir_src_for_ssa(total_offset));
154 }
155 }
156 }
157 }
158 return true;
159 }
160
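/* Lower VS inputs: nir_lower_io turns input derefs into load_input
 * intrinsics on vec4 slots, constant offsets are folded into the intrinsic
 * base, attribute workarounds are applied, and finally VERT_ATTRIB_* bases
 * are remapped to the packed slot numbering used by the vertex fetcher,
 * with gl_VertexID and friends converted to loads from the trailing SGV
 * elements.
 */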
161 void
162 brw_nir_lower_vs_inputs(nir_shader *nir,
163 const uint8_t *vs_attrib_wa_flags)
164 {
165 /* Start with the location of the variable's base. */
166 nir_foreach_shader_in_variable(var, nir)
167 var->data.driver_location = var->data.location;
168
169 /* Now use nir_lower_io to walk dereference chains. Attribute arrays are
170 * loaded as one vec4 or dvec4 per element (or matrix column), depending on
171 * whether it is a double-precision type or not.
172 */
173 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
174 nir_lower_io_lower_64bit_to_32);
175
176 /* This pass needs actual constants */
177 nir_opt_constant_folding(nir);
178
179 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
180
181 brw_nir_apply_attribute_workarounds(nir, vs_attrib_wa_flags);
182
183 /* The last step is to remap VERT_ATTRIB_* to actual registers */
184
185 /* Whether or not we have any system generated values. gl_DrawID is not
186 * included here as it lives in its own vec4.
187 */
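   /* When present, the SGV vec4 lives in the slot right after the last user
    * input and is laid out as <FirstVertex, BaseInstance, VertexIDZeroBase,
    * InstanceID>; gl_DrawID and IsIndexedDraw land in .x/.y of the vec4
    * after that (see the switch below).
    */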
188 const bool has_sgvs =
189 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
190 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
191 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
192 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
193
194 const unsigned num_inputs = util_bitcount64(nir->info.inputs_read);
195
196 nir_foreach_function(function, nir) {
197 if (!function->impl)
198 continue;
199
200 nir_builder b;
201 nir_builder_init(&b, function->impl);
202
203 nir_foreach_block(block, function->impl) {
204 nir_foreach_instr_safe(instr, block) {
205 if (instr->type != nir_instr_type_intrinsic)
206 continue;
207
208 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
209
210 switch (intrin->intrinsic) {
211 case nir_intrinsic_load_first_vertex:
212 case nir_intrinsic_load_base_instance:
213 case nir_intrinsic_load_vertex_id_zero_base:
214 case nir_intrinsic_load_instance_id:
215 case nir_intrinsic_load_is_indexed_draw:
216 case nir_intrinsic_load_draw_id: {
217 b.cursor = nir_after_instr(&intrin->instr);
218
219 /* gl_VertexID and friends are stored by the VF as the last
220 * vertex element. We convert them to load_input intrinsics at
221 * the right location.
222 */
223 nir_intrinsic_instr *load =
224 nir_intrinsic_instr_create(nir, nir_intrinsic_load_input);
225 load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
226
227 nir_intrinsic_set_base(load, num_inputs);
228 switch (intrin->intrinsic) {
229 case nir_intrinsic_load_first_vertex:
230 nir_intrinsic_set_component(load, 0);
231 break;
232 case nir_intrinsic_load_base_instance:
233 nir_intrinsic_set_component(load, 1);
234 break;
235 case nir_intrinsic_load_vertex_id_zero_base:
236 nir_intrinsic_set_component(load, 2);
237 break;
238 case nir_intrinsic_load_instance_id:
239 nir_intrinsic_set_component(load, 3);
240 break;
241 case nir_intrinsic_load_draw_id:
242 case nir_intrinsic_load_is_indexed_draw:
243 /* gl_DrawID and IsIndexedDraw are stored right after
244 * gl_VertexID and friends if any of them exist.
245 */
246 nir_intrinsic_set_base(load, num_inputs + has_sgvs);
247 if (intrin->intrinsic == nir_intrinsic_load_draw_id)
248 nir_intrinsic_set_component(load, 0);
249 else
250 nir_intrinsic_set_component(load, 1);
251 break;
252 default:
253 unreachable("Invalid system value intrinsic");
254 }
255
256 load->num_components = 1;
257 nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
258 nir_builder_instr_insert(&b, &load->instr);
259
260 nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
261 nir_src_for_ssa(&load->dest.ssa));
262 nir_instr_remove(&intrin->instr);
263 break;
264 }
265
266 case nir_intrinsic_load_input: {
267 /* Attributes come in a contiguous block, ordered by their
268 * gl_vert_attrib value. That means we can compute the slot
269 * number for an attribute by masking out the enabled attributes
270 * before it and counting the bits.
271 */
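               /* For example, if inputs_read has bits {0, 2, 5} set and this
                * load reads attribute 5, inputs_read & BITFIELD64_MASK(5)
                * keeps bits {0, 2} and util_bitcount64() yields slot 2.
                */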
272 int attr = nir_intrinsic_base(intrin);
273 int slot = util_bitcount64(nir->info.inputs_read &
274 BITFIELD64_MASK(attr));
275 nir_intrinsic_set_base(intrin, slot);
276 break;
277 }
278
279 default:
280 break; /* Nothing to do */
281 }
282 }
283 }
284 }
285 }
286
287 void
288 brw_nir_lower_vue_inputs(nir_shader *nir,
289 const struct brw_vue_map *vue_map)
290 {
291 nir_foreach_shader_in_variable(var, nir)
292 var->data.driver_location = var->data.location;
293
294 /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
295 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
296 nir_lower_io_lower_64bit_to_32);
297
298 /* This pass needs actual constants */
299 nir_opt_constant_folding(nir);
300
301 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
302
303 nir_foreach_function(function, nir) {
304 if (!function->impl)
305 continue;
306
307 nir_foreach_block(block, function->impl) {
308 nir_foreach_instr(instr, block) {
309 if (instr->type != nir_instr_type_intrinsic)
310 continue;
311
312 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
313
314 if (intrin->intrinsic == nir_intrinsic_load_input ||
315 intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
316 /* Offset 0 is the VUE header, which contains
317 * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
318 * VARYING_SLOT_PSIZ [.w].
319 */
320 int varying = nir_intrinsic_base(intrin);
321 int vue_slot;
322 switch (varying) {
323 case VARYING_SLOT_PSIZ:
324 nir_intrinsic_set_base(intrin, 0);
325 nir_intrinsic_set_component(intrin, 3);
326 break;
327
328 default:
329 vue_slot = vue_map->varying_to_slot[varying];
330 assert(vue_slot != -1);
331 nir_intrinsic_set_base(intrin, vue_slot);
332 break;
333 }
334 }
335 }
336 }
337 }
338 }
339
340 void
341 brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map)
342 {
343 nir_foreach_shader_in_variable(var, nir)
344 var->data.driver_location = var->data.location;
345
346 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
347 nir_lower_io_lower_64bit_to_32);
348
349 /* This pass needs actual constants */
350 nir_opt_constant_folding(nir);
351
352 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
353
354 nir_foreach_function(function, nir) {
355 if (function->impl) {
356 nir_builder b;
357 nir_builder_init(&b, function->impl);
358 nir_foreach_block(block, function->impl) {
359 remap_patch_urb_offsets(block, &b, vue_map,
360 nir->info.tess.primitive_mode);
361 }
362 }
363 }
364 }
365
366 void
367 brw_nir_lower_fs_inputs(nir_shader *nir,
368 const struct gen_device_info *devinfo,
369 const struct brw_wm_prog_key *key)
370 {
371 nir_foreach_shader_in_variable(var, nir) {
372 var->data.driver_location = var->data.location;
373
374 /* Apply default interpolation mode.
375 *
376 * Everything defaults to smooth except for the legacy GL color
377 * built-in variables, which might be flat depending on API state.
378 */
379 if (var->data.interpolation == INTERP_MODE_NONE) {
380 const bool flat = key->flat_shade &&
381 (var->data.location == VARYING_SLOT_COL0 ||
382 var->data.location == VARYING_SLOT_COL1);
383
384 var->data.interpolation = flat ? INTERP_MODE_FLAT
385 : INTERP_MODE_SMOOTH;
386 }
387
388 /* On Ironlake and below, there is only one interpolation mode.
389 * Centroid interpolation doesn't mean anything on this hardware --
390 * there is no multisampling.
391 */
392 if (devinfo->gen < 6) {
393 var->data.centroid = false;
394 var->data.sample = false;
395 }
396 }
397
398 nir_lower_io_options lower_io_options = nir_lower_io_lower_64bit_to_32;
399 if (key->persample_interp)
400 lower_io_options |= nir_lower_io_force_sample_interpolation;
401
402 nir_lower_io(nir, nir_var_shader_in, type_size_vec4, lower_io_options);
403 if (devinfo->gen >= 11)
404 nir_lower_interpolation(nir, ~0);
405
406 /* This pass needs actual constants */
407 nir_opt_constant_folding(nir);
408
409 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
410 }
411
412 void
413 brw_nir_lower_vue_outputs(nir_shader *nir)
414 {
415 nir_foreach_shader_out_variable(var, nir) {
416 var->data.driver_location = var->data.location;
417 }
418
419 nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
420 nir_lower_io_lower_64bit_to_32);
421 }
422
423 void
424 brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue_map,
425 GLenum tes_primitive_mode)
426 {
427 nir_foreach_shader_out_variable(var, nir) {
428 var->data.driver_location = var->data.location;
429 }
430
431 nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
432 nir_lower_io_lower_64bit_to_32);
433
434 /* This pass needs actual constants */
435 nir_opt_constant_folding(nir);
436
437 nir_io_add_const_offset_to_base(nir, nir_var_shader_out);
438
439 nir_foreach_function(function, nir) {
440 if (function->impl) {
441 nir_builder b;
442 nir_builder_init(&b, function->impl);
443 nir_foreach_block(block, function->impl) {
444 remap_patch_urb_offsets(block, &b, vue_map, tes_primitive_mode);
445 }
446 }
447 }
448 }
449
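/* Fragment shader outputs don't go through the VUE map.  Instead, both the
 * dual-source-blend index and the location are packed into driver_location
 * via the BRW_NIR_FRAG_OUTPUT_INDEX/LOCATION fields so the back-end can
 * recover them from the lowered store_output intrinsics.
 */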
450 void
451 brw_nir_lower_fs_outputs(nir_shader *nir)
452 {
453 nir_foreach_shader_out_variable(var, nir) {
454 var->data.driver_location =
455 SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) |
456 SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION);
457 }
458
459 nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0);
460 }
461
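/* Convenience wrapper around NIR_PASS: runs the given pass on the shader
 * named "nir" in the enclosing scope, ORs the result into a local "progress"
 * variable (which the caller must declare), and evaluates to whether this
 * particular pass made progress, e.g.:
 *
 *    bool progress = false;
 *    if (OPT(nir_opt_dce))
 *       OPT(nir_copy_prop);
 */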
462 #define OPT(pass, ...) ({ \
463 bool this_progress = false; \
464 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
465 if (this_progress) \
466 progress = true; \
467 this_progress; \
468 })
469
470 static nir_variable_mode
471 brw_nir_no_indirect_mask(const struct brw_compiler *compiler,
472 gl_shader_stage stage)
473 {
474 const struct gen_device_info *devinfo = compiler->devinfo;
475 const bool is_scalar = compiler->scalar_stage[stage];
476 nir_variable_mode indirect_mask = 0;
477
478 switch (stage) {
479 case MESA_SHADER_VERTEX:
480 case MESA_SHADER_FRAGMENT:
481 indirect_mask |= nir_var_shader_in;
482 break;
483
484 case MESA_SHADER_GEOMETRY:
485 if (!is_scalar)
486 indirect_mask |= nir_var_shader_in;
487 break;
488
489 default:
490 /* Everything else can handle indirect inputs */
491 break;
492 }
493
494 if (is_scalar && stage != MESA_SHADER_TESS_CTRL)
495 indirect_mask |= nir_var_shader_out;
496
497 /* On HSW+, we allow indirects in scalar shaders. They get implemented
498 * using nir_lower_vars_to_explicit_types and nir_lower_explicit_io in
499 * brw_postprocess_nir.
500 *
501 * We haven't plumbed through the indirect scratch messages on gen6 or
502 * earlier so doing indirects via scratch doesn't work there. On gen7 and
503 * earlier the scratch space size is limited to 12kB. If we allowed
504 * indirects as scratch all the time, we may easily exceed this limit
505 * without having any fallback.
506 */
507 if (is_scalar && devinfo->gen <= 7 && !devinfo->is_haswell)
508 indirect_mask |= nir_var_function_temp;
509
510 return indirect_mask;
511 }
512
513 void
514 brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
515 bool is_scalar, bool allow_copies)
516 {
517 nir_variable_mode loop_indirect_mask =
518 brw_nir_no_indirect_mask(compiler, nir->info.stage);
519
520 /* We can handle indirects via scratch messages. However, they are
521 * expensive so we'd rather not if we can avoid it. Have loop unrolling
522 * try to get rid of them.
523 */
524 if (is_scalar)
525 loop_indirect_mask |= nir_var_function_temp;
526
527 bool progress;
528 unsigned lower_flrp =
529 (nir->options->lower_flrp16 ? 16 : 0) |
530 (nir->options->lower_flrp32 ? 32 : 0) |
531 (nir->options->lower_flrp64 ? 64 : 0);
532
533 do {
534 progress = false;
535 OPT(nir_split_array_vars, nir_var_function_temp);
536 OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
537 OPT(nir_opt_deref);
538 OPT(nir_lower_vars_to_ssa);
539 if (allow_copies) {
540 /* Only run this pass in the first call to brw_nir_optimize. Later
541 * calls assume that we've lowered away any copy_deref instructions
542 * and we don't want to introduce any more.
543 */
544 OPT(nir_opt_find_array_copies);
545 }
546 OPT(nir_opt_copy_prop_vars);
547 OPT(nir_opt_dead_write_vars);
548 OPT(nir_opt_combine_stores, nir_var_all);
549
550 if (is_scalar) {
551 OPT(nir_lower_alu_to_scalar, NULL, NULL);
552 } else {
553 OPT(nir_opt_shrink_vectors);
554 }
555
556 OPT(nir_copy_prop);
557
558 if (is_scalar) {
559 OPT(nir_lower_phis_to_scalar);
560 }
561
562 OPT(nir_copy_prop);
563 OPT(nir_opt_dce);
564 OPT(nir_opt_cse);
565 OPT(nir_opt_combine_stores, nir_var_all);
566
567 /* Passing 0 to the peephole select pass causes it to convert
568 * if-statements that contain only move instructions in the branches
569 * regardless of the count.
570 *
571 * Passing 1 to the peephole select pass causes it to convert
572 * if-statements that contain at most a single ALU instruction (total)
573 * in both branches. Before Gen6, some math instructions were
574     * prohibitively expensive and the results of compare operations needed an
575 * extra resolve step. For these reasons, this pass is more harmful
576 * than good on those platforms.
577 *
578 * For indirect loads of uniforms (push constants), we assume that array
579 * indices will nearly always be in bounds and the cost of the load is
580 * low. Therefore there shouldn't be a performance benefit to avoid it.
581 * However, in vec4 tessellation shaders, these loads operate by
582 * actually pulling from memory.
583 */
584 const bool is_vec4_tessellation = !is_scalar &&
585 (nir->info.stage == MESA_SHADER_TESS_CTRL ||
586 nir->info.stage == MESA_SHADER_TESS_EVAL);
587 OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
588 OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation,
589 compiler->devinfo->gen >= 6);
590
591 OPT(nir_opt_intrinsics);
592 OPT(nir_opt_idiv_const, 32);
593 OPT(nir_opt_algebraic);
594 OPT(nir_opt_constant_folding);
595
596 if (lower_flrp != 0) {
597 if (OPT(nir_lower_flrp,
598 lower_flrp,
599 false /* always_precise */)) {
600 OPT(nir_opt_constant_folding);
601 }
602
603 /* Nothing should rematerialize any flrps, so we only need to do this
604 * lowering once.
605 */
606 lower_flrp = 0;
607 }
608
609 OPT(nir_opt_dead_cf);
610 if (OPT(nir_opt_trivial_continues)) {
611 /* If nir_opt_trivial_continues makes progress, then we need to clean
612 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
613 * to make progress.
614 */
615 OPT(nir_copy_prop);
616 OPT(nir_opt_dce);
617 }
618 OPT(nir_opt_if, false);
619 OPT(nir_opt_conditional_discard);
620 if (nir->options->max_unroll_iterations != 0) {
621 OPT(nir_opt_loop_unroll, loop_indirect_mask);
622 }
623 OPT(nir_opt_remove_phis);
624 OPT(nir_opt_undef);
625 OPT(nir_lower_pack);
626 } while (progress);
627
628    /* Work around a GFXBench shader's unused local sampler variable, which
629     * would otherwise trigger an assert in the opt_large_constants pass.
630 */
631 OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
632 }
633
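/* Callback for nir_lower_bit_size: returns the bit size an instruction
 * should be promoted to (and narrowed back from afterwards), or 0 to leave
 * it alone.  Used below to run sub-32-bit operations that the hardware
 * can't do natively at 32 (or 16) bits.
 */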
634 static unsigned
635 lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
636 {
637 const struct brw_compiler *compiler = (const struct brw_compiler *) data;
638 const struct gen_device_info *devinfo = compiler->devinfo;
639
640 switch (instr->type) {
641 case nir_instr_type_alu: {
642 nir_alu_instr *alu = nir_instr_as_alu(instr);
643 assert(alu->dest.dest.is_ssa);
644 if (alu->dest.dest.ssa.bit_size >= 32)
645 return 0;
646
647 switch (alu->op) {
648 case nir_op_idiv:
649 case nir_op_imod:
650 case nir_op_irem:
651 case nir_op_udiv:
652 case nir_op_umod:
653 case nir_op_fceil:
654 case nir_op_ffloor:
655 case nir_op_ffract:
656 case nir_op_fround_even:
657 case nir_op_ftrunc:
658 return 32;
659 case nir_op_frcp:
660 case nir_op_frsq:
661 case nir_op_fsqrt:
662 case nir_op_fpow:
663 case nir_op_fexp2:
664 case nir_op_flog2:
665 case nir_op_fsin:
666 case nir_op_fcos:
667 return devinfo->gen < 9 ? 32 : 0;
668 default:
669 if (devinfo->gen >= 11) {
670 if (nir_op_infos[alu->op].num_inputs >= 2 &&
671 alu->dest.dest.ssa.bit_size == 8)
672 return 16;
673
674 if (nir_alu_instr_is_comparison(alu) &&
675 alu->src[0].src.ssa->bit_size == 8)
676 return 16;
677 }
678 return 0;
679 }
680 break;
681 }
682
683 case nir_instr_type_intrinsic: {
684 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
685 switch (intrin->intrinsic) {
686 case nir_intrinsic_read_invocation:
687 case nir_intrinsic_read_first_invocation:
688 case nir_intrinsic_vote_feq:
689 case nir_intrinsic_vote_ieq:
690 case nir_intrinsic_shuffle:
691 case nir_intrinsic_shuffle_xor:
692 case nir_intrinsic_shuffle_up:
693 case nir_intrinsic_shuffle_down:
694 case nir_intrinsic_quad_broadcast:
695 case nir_intrinsic_quad_swap_horizontal:
696 case nir_intrinsic_quad_swap_vertical:
697 case nir_intrinsic_quad_swap_diagonal:
698 if (intrin->src[0].ssa->bit_size == 8 && devinfo->gen >= 11)
699 return 16;
700 return 0;
701
702 case nir_intrinsic_reduce:
703 case nir_intrinsic_inclusive_scan:
704 case nir_intrinsic_exclusive_scan:
705 /* There are a couple of register region issues that make things
706 * complicated for 8-bit types:
707 *
708 * 1. Only raw moves are allowed to write to a packed 8-bit
709 * destination.
710 * 2. If we use a strided destination, the efficient way to do
711 * scan operations ends up using strides that are too big to
712 * encode in an instruction.
713 *
714 * To get around these issues, we just do all 8-bit scan operations
715 * in 16 bits. It's actually fewer instructions than what we'd have
716 * to do if we were trying to do it in native 8-bit types and the
717 * results are the same once we truncate to 8 bits at the end.
718 */
719 if (intrin->dest.ssa.bit_size == 8)
720 return 16;
721 return 0;
722
723 default:
724 return 0;
725 }
726 break;
727 }
728
729 default:
730 return 0;
731 }
732 }
733
734 /* Does some simple lowering and runs the standard suite of optimizations
735 *
736 * This is intended to be called more-or-less directly after you get the
737 * shader out of GLSL or some other source. While it is geared towards i965,
738 * it is not at all generator-specific except for the is_scalar flag. Even
739 * there, it is safe to call with is_scalar = false for a shader that is
740 * intended for the FS backend as long as nir_optimize is called again with
741 * is_scalar = true to scalarize everything prior to code gen.
742 */
743 void
744 brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
745 const nir_shader *softfp64)
746 {
747 const struct gen_device_info *devinfo = compiler->devinfo;
748 UNUSED bool progress; /* Written by OPT */
749
750 const bool is_scalar = compiler->scalar_stage[nir->info.stage];
751
752 nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
753
754 if (is_scalar) {
755 OPT(nir_lower_alu_to_scalar, NULL, NULL);
756 }
757
758 if (nir->info.stage == MESA_SHADER_GEOMETRY)
759 OPT(nir_lower_gs_intrinsics, 0);
760
761 /* See also brw_nir_trig_workarounds.py */
762 if (compiler->precise_trig &&
763 !(devinfo->gen >= 10 || devinfo->is_kabylake))
764 OPT(brw_nir_apply_trig_workarounds);
765
766 if (devinfo->gen >= 12)
767 OPT(brw_nir_clamp_image_1d_2d_array_sizes);
768
769 static const nir_lower_tex_options tex_options = {
770 .lower_txp = ~0,
771 .lower_txf_offset = true,
772 .lower_rect_offset = true,
773 .lower_tex_without_implicit_lod = true,
774 .lower_txd_cube_map = true,
775 .lower_txb_shadow_clamp = true,
776 .lower_txd_shadow_clamp = true,
777 .lower_txd_offset_clamp = true,
778 .lower_tg4_offsets = true,
779 };
780
781 OPT(nir_lower_tex, &tex_options);
782 OPT(nir_normalize_cubemap_coords);
783
784 OPT(nir_lower_global_vars_to_local);
785
786 OPT(nir_split_var_copies);
787 OPT(nir_split_struct_vars, nir_var_function_temp);
788
789 brw_nir_optimize(nir, compiler, is_scalar, true);
790
791 OPT(nir_lower_doubles, softfp64, nir->options->lower_doubles_options);
792 OPT(nir_lower_int64);
793
794 OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
795
796 if (is_scalar) {
797 OPT(nir_lower_load_const_to_scalar);
798 }
799
800 /* Lower a bunch of stuff */
801 OPT(nir_lower_var_copies);
802
803 /* This needs to be run after the first optimization pass but before we
804 * lower indirect derefs away
805 */
806 if (compiler->supports_shader_constants) {
807 OPT(nir_opt_large_constants, NULL, 32);
808 }
809
810 OPT(nir_lower_system_values);
811 OPT(nir_lower_compute_system_values, NULL);
812
813 const nir_lower_subgroups_options subgroups_options = {
814 .ballot_bit_size = 32,
815 .lower_to_scalar = true,
816 .lower_vote_trivial = !is_scalar,
817 .lower_shuffle = true,
818 .lower_quad_broadcast_dynamic = true,
819 .lower_elect = true,
820 };
821 OPT(nir_lower_subgroups, &subgroups_options);
822
823 OPT(nir_lower_clip_cull_distance_arrays);
824
825 nir_variable_mode indirect_mask =
826 brw_nir_no_indirect_mask(compiler, nir->info.stage);
827 OPT(nir_lower_indirect_derefs, indirect_mask, UINT32_MAX);
828
829    /* Even in cases where we can handle indirect temporaries via scratch, it
830     * can still be expensive.  Lower indirects on small arrays to
831 * conditional load/stores.
832 *
833 * The threshold of 16 was chosen semi-arbitrarily. The idea is that an
834 * indirect on an array of 16 elements is about 30 instructions at which
835 * point, you may be better off doing a send. With a SIMD8 program, 16
836 * floats is 1/8 of the entire register file. Any array larger than that
837 * is likely to cause pressure issues. Also, this value is sufficiently
838 * high that the benchmarks known to suffer from large temporary array
839 * issues are helped but nothing else in shader-db is hurt except for maybe
840 * that one kerbal space program shader.
841 */
842 if (is_scalar && !(indirect_mask & nir_var_function_temp))
843 OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
844
845 /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
846 * SSBOs, our back-end is capable of loading an entire vec4 at a time and
847 * we would like to take advantage of that whenever possible regardless of
848 * whether or not the app gives us full loads. This should allow the
849 * optimizer to combine UBO and SSBO load operations and save us some send
850 * messages.
851 */
852 OPT(nir_lower_array_deref_of_vec,
853 nir_var_mem_ubo | nir_var_mem_ssbo,
854 nir_lower_direct_array_deref_of_vec_load);
855
856 /* Get rid of split copies */
857 brw_nir_optimize(nir, compiler, is_scalar, false);
858 }
859
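/* Cross-stage linking: lower IO arrays to elements, scalarize IO early when
 * both stages are scalar, apply varying-level optimizations, drop dead and
 * unused varyings (re-lowering any indirects that demotion exposes), then
 * re-vectorize IO and, for non-TCS producers, lower outputs to temporaries
 * to get rid of the write-masked stores that nir_lower_io_to_vector creates.
 */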
860 void
861 brw_nir_link_shaders(const struct brw_compiler *compiler,
862 nir_shader *producer, nir_shader *consumer)
863 {
864 nir_lower_io_arrays_to_elements(producer, consumer);
865 nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
866 nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
867
868 const bool p_is_scalar = compiler->scalar_stage[producer->info.stage];
869 const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage];
870
871 if (p_is_scalar && c_is_scalar) {
872 NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
873 NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
874 brw_nir_optimize(producer, compiler, p_is_scalar, false);
875 brw_nir_optimize(consumer, compiler, c_is_scalar, false);
876 }
877
878 if (nir_link_opt_varyings(producer, consumer))
879 brw_nir_optimize(consumer, compiler, c_is_scalar, false);
880
881 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
882 NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
883
884 if (nir_remove_unused_varyings(producer, consumer)) {
885 NIR_PASS_V(producer, nir_lower_global_vars_to_local);
886 NIR_PASS_V(consumer, nir_lower_global_vars_to_local);
887
888 /* The backend might not be able to handle indirects on
889 * temporaries so we need to lower indirects on any of the
890 * varyings we have demoted here.
891 */
892 NIR_PASS_V(producer, nir_lower_indirect_derefs,
893 brw_nir_no_indirect_mask(compiler, producer->info.stage),
894 UINT32_MAX);
895 NIR_PASS_V(consumer, nir_lower_indirect_derefs,
896 brw_nir_no_indirect_mask(compiler, consumer->info.stage),
897 UINT32_MAX);
898
899 brw_nir_optimize(producer, compiler, p_is_scalar, false);
900 brw_nir_optimize(consumer, compiler, c_is_scalar, false);
901 }
902
903 NIR_PASS_V(producer, nir_lower_io_to_vector, nir_var_shader_out);
904 NIR_PASS_V(producer, nir_opt_combine_stores, nir_var_shader_out);
905 NIR_PASS_V(consumer, nir_lower_io_to_vector, nir_var_shader_in);
906
907 if (producer->info.stage != MESA_SHADER_TESS_CTRL) {
908 /* Calling lower_io_to_vector creates output variable writes with
909 * write-masks. On non-TCS outputs, the back-end can't handle it and we
910 * need to call nir_lower_io_to_temporaries to get rid of them. This,
911 * in turn, creates temporary variables and extra copy_deref intrinsics
912 * that we need to clean up.
913 */
914 NIR_PASS_V(producer, nir_lower_io_to_temporaries,
915 nir_shader_get_entrypoint(producer), true, false);
916 NIR_PASS_V(producer, nir_lower_global_vars_to_local);
917 NIR_PASS_V(producer, nir_split_var_copies);
918 NIR_PASS_V(producer, nir_lower_var_copies);
919 }
920 }
921
922 static bool
923 brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
924 unsigned bit_size,
925 unsigned num_components,
926 nir_intrinsic_instr *low,
927 nir_intrinsic_instr *high)
928 {
929 /* Don't combine things to generate 64-bit loads/stores. We have to split
930 * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
931 * we don't want to make a mess for the back-end.
932 */
933 if (bit_size > 32)
934 return false;
935
936 /* We can handle at most a vec4 right now. Anything bigger would get
937 * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
938 */
939 if (num_components > 4)
940 return false;
941
942
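   /* The effective alignment is align_mul when align_offset is zero;
    * otherwise it is the largest power of two that divides align_offset
    * (e.g. align_mul = 16, align_offset = 4 gives an alignment of 4).
    */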
943 uint32_t align;
944 if (align_offset)
945 align = 1 << (ffs(align_offset) - 1);
946 else
947 align = align_mul;
948
949 if (align < bit_size / 8)
950 return false;
951
952 return true;
953 }
954
955 static
956 bool combine_all_barriers(nir_intrinsic_instr *a,
957 nir_intrinsic_instr *b,
958 void *data)
959 {
960 /* Translation to backend IR will get rid of modes we don't care about, so
961 * no harm in always combining them.
962 *
963 * TODO: While HW has only ACQUIRE|RELEASE fences, we could improve the
964 * scheduling so that it can take advantage of the different semantics.
965 */
966 nir_intrinsic_set_memory_modes(a, nir_intrinsic_memory_modes(a) |
967 nir_intrinsic_memory_modes(b));
968 nir_intrinsic_set_memory_semantics(a, nir_intrinsic_memory_semantics(a) |
969 nir_intrinsic_memory_semantics(b));
970 nir_intrinsic_set_memory_scope(a, MAX2(nir_intrinsic_memory_scope(a),
971 nir_intrinsic_memory_scope(b)));
972 return true;
973 }
974
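/* For scalar shaders, vectorize adjacent UBO/SSBO/shared/global accesses
 * (subject to brw_nir_should_vectorize_mem above), then lower any access
 * sizes the hardware can't do directly and run a small cleanup loop over
 * whatever that lowering produced.
 */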
975 static void
976 brw_vectorize_lower_mem_access(nir_shader *nir,
977 const struct brw_compiler *compiler,
978 bool is_scalar)
979 {
980 const struct gen_device_info *devinfo = compiler->devinfo;
981 bool progress = false;
982
983 if (is_scalar) {
984 OPT(nir_opt_load_store_vectorize,
985 nir_var_mem_ubo | nir_var_mem_ssbo |
986 nir_var_mem_global | nir_var_mem_shared,
987 brw_nir_should_vectorize_mem,
988 (nir_variable_mode)0);
989 }
990
991 OPT(brw_nir_lower_mem_access_bit_sizes, devinfo);
992
993 while (progress) {
994 progress = false;
995
996 OPT(nir_lower_pack);
997 OPT(nir_copy_prop);
998 OPT(nir_opt_dce);
999 OPT(nir_opt_cse);
1000 OPT(nir_opt_algebraic);
1001 OPT(nir_opt_constant_folding);
1002 }
1003 }
1004
1005 static bool
1006 nir_shader_has_local_variables(const nir_shader *nir)
1007 {
1008 nir_foreach_function(func, nir) {
1009 if (func->impl && !exec_list_is_empty(&func->impl->locals))
1010 return true;
1011 }
1012
1013 return false;
1014 }
1015
1016 /* Prepare the given shader for codegen
1017 *
1018 * This function is intended to be called right before going into the actual
1019 * backend and is highly backend-specific. Also, once this function has been
1020 * called on a shader, it will no longer be in SSA form so most optimizations
1021 * will not work.
1022 */
1023 void
1024 brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
1025 bool is_scalar)
1026 {
1027 const struct gen_device_info *devinfo = compiler->devinfo;
1028 bool debug_enabled =
1029 (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->info.stage));
1030
1031 UNUSED bool progress; /* Written by OPT */
1032
1033 OPT(brw_nir_lower_scoped_barriers);
1034 OPT(nir_opt_combine_memory_barriers, combine_all_barriers, NULL);
1035
1036 do {
1037 progress = false;
1038 OPT(nir_opt_algebraic_before_ffma);
1039 } while (progress);
1040
1041 brw_nir_optimize(nir, compiler, is_scalar, false);
1042
1043 if (is_scalar && nir_shader_has_local_variables(nir)) {
1044 OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
1045 glsl_get_natural_size_align_bytes);
1046 OPT(nir_lower_explicit_io, nir_var_function_temp,
1047 nir_address_format_32bit_offset);
1048 brw_nir_optimize(nir, compiler, is_scalar, false);
1049 }
1050
1051 brw_vectorize_lower_mem_access(nir, compiler, is_scalar);
1052
1053 if (OPT(nir_lower_int64))
1054 brw_nir_optimize(nir, compiler, is_scalar, false);
1055
1056 if (devinfo->gen >= 6) {
1057 /* Try and fuse multiply-adds */
1058 OPT(brw_nir_opt_peephole_ffma);
1059 }
1060
1061 if (OPT(nir_opt_comparison_pre)) {
1062 OPT(nir_copy_prop);
1063 OPT(nir_opt_dce);
1064 OPT(nir_opt_cse);
1065
1066       /* Do the select peephole again.  nir_opt_comparison_pre (combined with
1067 * the other optimization passes) will have removed at least one
1068 * instruction from one of the branches of the if-statement, so now it
1069 * might be under the threshold of conversion to bcsel.
1070 *
1071 * See brw_nir_optimize for the explanation of is_vec4_tessellation.
1072 */
1073 const bool is_vec4_tessellation = !is_scalar &&
1074 (nir->info.stage == MESA_SHADER_TESS_CTRL ||
1075 nir->info.stage == MESA_SHADER_TESS_EVAL);
1076 OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false);
1077 OPT(nir_opt_peephole_select, 1, is_vec4_tessellation,
1078 compiler->devinfo->gen >= 6);
1079 }
1080
1081 do {
1082 progress = false;
1083 if (OPT(nir_opt_algebraic_late)) {
1084 /* At this late stage, anything that makes more constants will wreak
1085          * havoc on the vec4 backend.  The handling of constants in the vec4
1086 * backend is not good.
1087 */
1088 if (is_scalar)
1089 OPT(nir_opt_constant_folding);
1090
1091 OPT(nir_copy_prop);
1092 OPT(nir_opt_dce);
1093 OPT(nir_opt_cse);
1094 }
1095 } while (progress);
1096
1097
1098 OPT(brw_nir_lower_conversions);
1099
1100 if (is_scalar)
1101 OPT(nir_lower_alu_to_scalar, NULL, NULL);
1102
1103 while (OPT(nir_opt_algebraic_distribute_src_mods)) {
1104 OPT(nir_copy_prop);
1105 OPT(nir_opt_dce);
1106 OPT(nir_opt_cse);
1107 }
1108
1109 OPT(nir_copy_prop);
1110 OPT(nir_opt_dce);
1111 OPT(nir_opt_move, nir_move_comparisons);
1112
1113 OPT(nir_lower_bool_to_int32);
1114 OPT(nir_copy_prop);
1115 OPT(nir_opt_dce);
1116
1117 OPT(nir_lower_locals_to_regs);
1118
1119 if (unlikely(debug_enabled)) {
1120 /* Re-index SSA defs so we print more sensible numbers. */
1121 nir_foreach_function(function, nir) {
1122 if (function->impl)
1123 nir_index_ssa_defs(function->impl);
1124 }
1125
1126 fprintf(stderr, "NIR (SSA form) for %s shader:\n",
1127 _mesa_shader_stage_to_string(nir->info.stage));
1128 nir_print_shader(nir, stderr);
1129 }
1130
1131 nir_validate_ssa_dominance(nir, "before nir_convert_from_ssa");
1132
1133 OPT(nir_convert_from_ssa, true);
1134
1135 if (!is_scalar) {
1136 OPT(nir_move_vec_src_uses_to_dest);
1137 OPT(nir_lower_vec_to_movs);
1138 }
1139
1140 OPT(nir_opt_dce);
1141
1142 if (OPT(nir_opt_rematerialize_compares))
1143 OPT(nir_opt_dce);
1144
1145 /* This is the last pass we run before we start emitting stuff. It
1146 * determines when we need to insert boolean resolves on Gen <= 5. We
1147 * run it last because it stashes data in instr->pass_flags and we don't
1148 * want that to be squashed by other NIR passes.
1149 */
1150 if (devinfo->gen <= 5)
1151 brw_nir_analyze_boolean_resolves(nir);
1152
1153 nir_sweep(nir);
1154
1155 if (unlikely(debug_enabled)) {
1156 fprintf(stderr, "NIR (final form) for %s shader:\n",
1157 _mesa_shader_stage_to_string(nir->info.stage));
1158 nir_print_shader(nir, stderr);
1159 }
1160 }
1161
1162 static bool
1163 brw_nir_apply_sampler_key(nir_shader *nir,
1164 const struct brw_compiler *compiler,
1165 const struct brw_sampler_prog_key_data *key_tex)
1166 {
1167 const struct gen_device_info *devinfo = compiler->devinfo;
1168 nir_lower_tex_options tex_options = {
1169 .lower_txd_clamp_bindless_sampler = true,
1170 .lower_txd_clamp_if_sampler_index_not_lt_16 = true,
1171 };
1172
1173 /* Iron Lake and prior require lowering of all rectangle textures */
1174 if (devinfo->gen < 6)
1175 tex_options.lower_rect = true;
1176
1177 /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */
1178 if (devinfo->gen < 8) {
1179 tex_options.saturate_s = key_tex->gl_clamp_mask[0];
1180 tex_options.saturate_t = key_tex->gl_clamp_mask[1];
1181 tex_options.saturate_r = key_tex->gl_clamp_mask[2];
1182 }
1183
1184 /* Prior to Haswell, we have to fake texture swizzle */
1185 for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
1186 if (key_tex->swizzles[s] == SWIZZLE_NOOP)
1187 continue;
1188
1189 tex_options.swizzle_result |= BITFIELD_BIT(s);
1190 for (unsigned c = 0; c < 4; c++)
1191 tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
1192 }
1193
1194 /* Prior to Haswell, we have to lower gradients on shadow samplers */
1195 tex_options.lower_txd_shadow = devinfo->gen < 8 && !devinfo->is_haswell;
1196
1197 tex_options.lower_y_uv_external = key_tex->y_uv_image_mask;
1198 tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask;
1199 tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask;
1200 tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask;
1201 tex_options.lower_ayuv_external = key_tex->ayuv_image_mask;
1202 tex_options.lower_xyuv_external = key_tex->xyuv_image_mask;
1203 tex_options.bt709_external = key_tex->bt709_mask;
1204 tex_options.bt2020_external = key_tex->bt2020_mask;
1205
1206 /* Setup array of scaling factors for each texture. */
1207 memcpy(&tex_options.scale_factors, &key_tex->scale_factors,
1208 sizeof(tex_options.scale_factors));
1209
1210 return nir_lower_tex(nir, &tex_options);
1211 }
1212
1213 static unsigned
1214 get_subgroup_size(gl_shader_stage stage,
1215 const struct brw_base_prog_key *key,
1216 unsigned max_subgroup_size)
1217 {
1218 switch (key->subgroup_size_type) {
1219 case BRW_SUBGROUP_SIZE_API_CONSTANT:
1220 /* We have to use the global constant size. */
1221 return BRW_SUBGROUP_SIZE;
1222
1223 case BRW_SUBGROUP_SIZE_UNIFORM:
1224 /* It has to be uniform across all invocations but can vary per stage
1225 * if we want. This gives us a bit more freedom.
1226 *
1227 * For compute, brw_nir_apply_key is called per-dispatch-width so this
1228 * is the actual subgroup size and not a maximum. However, we only
1229 * invoke one size of any given compute shader so it's still guaranteed
1230 * to be uniform across invocations.
1231 */
1232 return max_subgroup_size;
1233
1234 case BRW_SUBGROUP_SIZE_VARYING:
1235 /* The subgroup size is allowed to be fully varying. For geometry
1236 * stages, we know it's always 8 which is max_subgroup_size so we can
1237 * return that. For compute, brw_nir_apply_key is called once per
1238 * dispatch-width so max_subgroup_size is the real subgroup size.
1239 *
1240 * For fragment, we return 0 and let it fall through to the back-end
1241 * compiler. This means we can't optimize based on subgroup size but
1242 * that's a risk the client took when it asked for a varying subgroup
1243 * size.
1244 */
1245 return stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
1246
1247 case BRW_SUBGROUP_SIZE_REQUIRE_8:
1248 case BRW_SUBGROUP_SIZE_REQUIRE_16:
1249 case BRW_SUBGROUP_SIZE_REQUIRE_32:
1250 assert(stage == MESA_SHADER_COMPUTE);
1251 /* These enum values are expressly chosen to be equal to the subgroup
1252 * size that they require.
1253 */
1254 return key->subgroup_size_type;
1255 }
1256
1257 unreachable("Invalid subgroup size type");
1258 }
1259
1260 void
1261 brw_nir_apply_key(nir_shader *nir,
1262 const struct brw_compiler *compiler,
1263 const struct brw_base_prog_key *key,
1264 unsigned max_subgroup_size,
1265 bool is_scalar)
1266 {
1267 bool progress = false;
1268
1269 OPT(brw_nir_apply_sampler_key, compiler, &key->tex);
1270
1271 const nir_lower_subgroups_options subgroups_options = {
1272 .subgroup_size = get_subgroup_size(nir->info.stage, key,
1273 max_subgroup_size),
1274 .ballot_bit_size = 32,
1275 .lower_subgroup_masks = true,
1276 };
1277 OPT(nir_lower_subgroups, &subgroups_options);
1278
1279 if (progress)
1280 brw_nir_optimize(nir, compiler, is_scalar, false);
1281 }
1282
1283 enum brw_conditional_mod
1284 brw_cmod_for_nir_comparison(nir_op op)
1285 {
1286 switch (op) {
1287 case nir_op_flt:
1288 case nir_op_flt32:
1289 case nir_op_ilt:
1290 case nir_op_ilt32:
1291 case nir_op_ult:
1292 case nir_op_ult32:
1293 return BRW_CONDITIONAL_L;
1294
1295 case nir_op_fge:
1296 case nir_op_fge32:
1297 case nir_op_ige:
1298 case nir_op_ige32:
1299 case nir_op_uge:
1300 case nir_op_uge32:
1301 return BRW_CONDITIONAL_GE;
1302
1303 case nir_op_feq:
1304 case nir_op_feq32:
1305 case nir_op_ieq:
1306 case nir_op_ieq32:
1307 case nir_op_b32all_fequal2:
1308 case nir_op_b32all_iequal2:
1309 case nir_op_b32all_fequal3:
1310 case nir_op_b32all_iequal3:
1311 case nir_op_b32all_fequal4:
1312 case nir_op_b32all_iequal4:
1313 return BRW_CONDITIONAL_Z;
1314
1315 case nir_op_fneu:
1316 case nir_op_fneu32:
1317 case nir_op_ine:
1318 case nir_op_ine32:
1319 case nir_op_b32any_fnequal2:
1320 case nir_op_b32any_inequal2:
1321 case nir_op_b32any_fnequal3:
1322 case nir_op_b32any_inequal3:
1323 case nir_op_b32any_fnequal4:
1324 case nir_op_b32any_inequal4:
1325 return BRW_CONDITIONAL_NZ;
1326
1327 default:
1328 unreachable("Unsupported NIR comparison op");
1329 }
1330 }
1331
1332 uint32_t
1333 brw_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic)
1334 {
1335 switch (atomic->intrinsic) {
1336 #define AOP_CASE(atom) \
1337 case nir_intrinsic_image_atomic_##atom: \
1338 case nir_intrinsic_bindless_image_atomic_##atom: \
1339 case nir_intrinsic_ssbo_atomic_##atom: \
1340 case nir_intrinsic_shared_atomic_##atom: \
1341 case nir_intrinsic_global_atomic_##atom
1342
1343 AOP_CASE(add): {
1344 unsigned src_idx;
1345 switch (atomic->intrinsic) {
1346 case nir_intrinsic_image_atomic_add:
1347 case nir_intrinsic_bindless_image_atomic_add:
1348 src_idx = 3;
1349 break;
1350 case nir_intrinsic_ssbo_atomic_add:
1351 src_idx = 2;
1352 break;
1353 case nir_intrinsic_shared_atomic_add:
1354 case nir_intrinsic_global_atomic_add:
1355 src_idx = 1;
1356 break;
1357 default:
1358 unreachable("Invalid add atomic opcode");
1359 }
1360
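      /* An add of constant +1 or -1 can use the dedicated atomic
       * increment/decrement operations instead of a generic add.
       */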
1361 if (nir_src_is_const(atomic->src[src_idx])) {
1362 int64_t add_val = nir_src_as_int(atomic->src[src_idx]);
1363 if (add_val == 1)
1364 return BRW_AOP_INC;
1365 else if (add_val == -1)
1366 return BRW_AOP_DEC;
1367 }
1368 return BRW_AOP_ADD;
1369 }
1370
1371 AOP_CASE(imin): return BRW_AOP_IMIN;
1372 AOP_CASE(umin): return BRW_AOP_UMIN;
1373 AOP_CASE(imax): return BRW_AOP_IMAX;
1374 AOP_CASE(umax): return BRW_AOP_UMAX;
1375 AOP_CASE(and): return BRW_AOP_AND;
1376 AOP_CASE(or): return BRW_AOP_OR;
1377 AOP_CASE(xor): return BRW_AOP_XOR;
1378 AOP_CASE(exchange): return BRW_AOP_MOV;
1379 AOP_CASE(comp_swap): return BRW_AOP_CMPWR;
1380
1381 #undef AOP_CASE
1382 #define AOP_CASE(atom) \
1383 case nir_intrinsic_ssbo_atomic_##atom: \
1384 case nir_intrinsic_shared_atomic_##atom: \
1385 case nir_intrinsic_global_atomic_##atom
1386
1387 AOP_CASE(fmin): return BRW_AOP_FMIN;
1388 AOP_CASE(fmax): return BRW_AOP_FMAX;
1389 AOP_CASE(fcomp_swap): return BRW_AOP_FCMPWR;
1390
1391 #undef AOP_CASE
1392
1393 default:
1394 unreachable("Unsupported NIR atomic intrinsic");
1395 }
1396 }
1397
1398 enum brw_reg_type
1399 brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type)
1400 {
1401 switch (type) {
1402 case nir_type_uint:
1403 case nir_type_uint32:
1404 return BRW_REGISTER_TYPE_UD;
1405 case nir_type_bool:
1406 case nir_type_int:
1407 case nir_type_bool32:
1408 case nir_type_int32:
1409 return BRW_REGISTER_TYPE_D;
1410 case nir_type_float:
1411 case nir_type_float32:
1412 return BRW_REGISTER_TYPE_F;
1413 case nir_type_float16:
1414 return BRW_REGISTER_TYPE_HF;
1415 case nir_type_float64:
1416 return BRW_REGISTER_TYPE_DF;
1417 case nir_type_int64:
1418 return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_Q;
1419 case nir_type_uint64:
1420 return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_UQ;
1421 case nir_type_int16:
1422 return BRW_REGISTER_TYPE_W;
1423 case nir_type_uint16:
1424 return BRW_REGISTER_TYPE_UW;
1425 case nir_type_int8:
1426 return BRW_REGISTER_TYPE_B;
1427 case nir_type_uint8:
1428 return BRW_REGISTER_TYPE_UB;
1429 default:
1430 unreachable("unknown type");
1431 }
1432
1433 return BRW_REGISTER_TYPE_F;
1434 }
1435
1436 /* Returns the glsl_base_type corresponding to a nir_alu_type.
1437 * This is used by both brw_vec4_nir and brw_fs_nir.
1438 */
1439 enum glsl_base_type
1440 brw_glsl_base_type_for_nir_type(nir_alu_type type)
1441 {
1442 switch (type) {
1443 case nir_type_float:
1444 case nir_type_float32:
1445 return GLSL_TYPE_FLOAT;
1446
1447 case nir_type_float16:
1448 return GLSL_TYPE_FLOAT16;
1449
1450 case nir_type_float64:
1451 return GLSL_TYPE_DOUBLE;
1452
1453 case nir_type_int:
1454 case nir_type_int32:
1455 return GLSL_TYPE_INT;
1456
1457 case nir_type_uint:
1458 case nir_type_uint32:
1459 return GLSL_TYPE_UINT;
1460
1461 case nir_type_int16:
1462 return GLSL_TYPE_INT16;
1463
1464 case nir_type_uint16:
1465 return GLSL_TYPE_UINT16;
1466
1467 default:
1468 unreachable("bad type");
1469 }
1470 }
1471
1472 nir_shader *
1473 brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler,
1474 const nir_shader_compiler_options *options,
1475 const struct brw_tcs_prog_key *key)
1476 {
1477 nir_builder b;
1478 nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_TESS_CTRL,
1479 options);
1480 nir_shader *nir = b.shader;
1481 nir_variable *var;
1482 nir_intrinsic_instr *load;
1483 nir_intrinsic_instr *store;
1484 nir_ssa_def *zero = nir_imm_int(&b, 0);
1485 nir_ssa_def *invoc_id = nir_load_invocation_id(&b);
1486
1487 nir->info.inputs_read = key->outputs_written &
1488 ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
1489 nir->info.outputs_written = key->outputs_written;
1490 nir->info.tess.tcs_vertices_out = key->input_vertices;
1491 nir->info.name = ralloc_strdup(nir, "passthrough");
1492 nir->num_uniforms = 8 * sizeof(uint32_t);
1493
1494 var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_0");
1495 var->data.location = 0;
1496 var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_1");
1497 var->data.location = 1;
1498
1499 /* Write the patch URB header. */
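   /* The first 8 uniform dwords (hdr_0/hdr_1 above) are loaded as two vec4s
    * and stored whole into the tess level output slots, i.e. the driver is
    * expected to push the packed patch header contents as uniforms.
    */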
1500 for (int i = 0; i <= 1; i++) {
1501 load = nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
1502 load->num_components = 4;
1503 load->src[0] = nir_src_for_ssa(zero);
1504 nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
1505 nir_intrinsic_set_base(load, i * 4 * sizeof(uint32_t));
1506 nir_builder_instr_insert(&b, &load->instr);
1507
1508 store = nir_intrinsic_instr_create(nir, nir_intrinsic_store_output);
1509 store->num_components = 4;
1510 store->src[0] = nir_src_for_ssa(&load->dest.ssa);
1511 store->src[1] = nir_src_for_ssa(zero);
1512 nir_intrinsic_set_base(store, VARYING_SLOT_TESS_LEVEL_INNER - i);
1513 nir_intrinsic_set_write_mask(store, WRITEMASK_XYZW);
1514 nir_builder_instr_insert(&b, &store->instr);
1515 }
1516
1517 /* Copy inputs to outputs. */
1518 uint64_t varyings = nir->info.inputs_read;
1519
1520 while (varyings != 0) {
1521 const int varying = ffsll(varyings) - 1;
1522
1523 load = nir_intrinsic_instr_create(nir,
1524 nir_intrinsic_load_per_vertex_input);
1525 load->num_components = 4;
1526 load->src[0] = nir_src_for_ssa(invoc_id);
1527 load->src[1] = nir_src_for_ssa(zero);
1528 nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
1529 nir_intrinsic_set_base(load, varying);
1530 nir_builder_instr_insert(&b, &load->instr);
1531
1532 store = nir_intrinsic_instr_create(nir,
1533 nir_intrinsic_store_per_vertex_output);
1534 store->num_components = 4;
1535 store->src[0] = nir_src_for_ssa(&load->dest.ssa);
1536 store->src[1] = nir_src_for_ssa(invoc_id);
1537 store->src[2] = nir_src_for_ssa(zero);
1538 nir_intrinsic_set_base(store, varying);
1539 nir_intrinsic_set_write_mask(store, WRITEMASK_XYZW);
1540 nir_builder_instr_insert(&b, &store->instr);
1541
1542 varyings &= ~BITFIELD64_BIT(varying);
1543 }
1544
1545 nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs");
1546
1547 brw_preprocess_nir(compiler, nir, NULL);
1548
1549 return nir;
1550 }
1551