1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "intel_nir.h"
25 #include "brw_nir.h"
26 #include "compiler/glsl_types.h"
27 #include "compiler/nir/nir_builder.h"
28
29 /*
30 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
31 * false) elements needed to pack a type.
32 */
33 static int
34 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
35 {
36 unsigned int i;
37 int size;
38
39 switch (type->base_type) {
40 case GLSL_TYPE_UINT:
41 case GLSL_TYPE_INT:
42 case GLSL_TYPE_FLOAT:
43 case GLSL_TYPE_FLOAT16:
44 case GLSL_TYPE_BOOL:
45 case GLSL_TYPE_DOUBLE:
46 case GLSL_TYPE_UINT16:
47 case GLSL_TYPE_INT16:
48 case GLSL_TYPE_UINT8:
49 case GLSL_TYPE_INT8:
50 case GLSL_TYPE_UINT64:
51 case GLSL_TYPE_INT64:
52 if (glsl_type_is_matrix(type)) {
53 const glsl_type *col_type = glsl_get_column_type(type);
54 unsigned col_slots =
55 (as_vec4 && glsl_type_is_dual_slot(col_type)) ? 2 : 1;
56 return type->matrix_columns * col_slots;
57 } else {
58 /* Regardless of the size of the vector, it gets a vec4. This is bad
59 * packing for things like floats, but otherwise arrays become a
60 * mess. Hopefully a later pass over the code can pack scalars
61 * down if appropriate.
62 */
63 return (as_vec4 && glsl_type_is_dual_slot(type)) ? 2 : 1;
64 }
65 case GLSL_TYPE_ARRAY:
66 assert(type->length > 0);
67 return type_size_xvec4(type->fields.array, as_vec4, bindless) *
68 type->length;
69 case GLSL_TYPE_STRUCT:
70 case GLSL_TYPE_INTERFACE:
71 size = 0;
72 for (i = 0; i < type->length; i++) {
73 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
74 bindless);
75 }
76 return size;
77 case GLSL_TYPE_SUBROUTINE:
78 return 1;
79
80 case GLSL_TYPE_SAMPLER:
81 case GLSL_TYPE_TEXTURE:
82 /* Samplers and textures take up no register space, since they're baked
83 * in at link time.
84 */
85 return bindless ? 1 : 0;
86 case GLSL_TYPE_ATOMIC_UINT:
87 return 0;
88 case GLSL_TYPE_IMAGE:
89 return bindless ? 1 : 0;
90 case GLSL_TYPE_VOID:
91 case GLSL_TYPE_ERROR:
92 case GLSL_TYPE_COOPERATIVE_MATRIX:
93 unreachable("not reached");
94 }
95
96 return 0;
97 }
98
99 /**
100 * Returns the minimum number of vec4 elements needed to pack a type.
101 *
102 * For simple types, it will return 1 (a single vec4); for matrices, the
103 * number of columns; for arrays and structs, the sum of the vec4 sizes of
104 * their elements; and for samplers and atomics, zero.
105 *
106 * This method is useful to calculate how much register space is needed to
107 * store a particular type.
108 */
109 int
110 type_size_vec4(const struct glsl_type *type, bool bindless)
111 {
112 return type_size_xvec4(type, true, bindless);
113 }
114
115 /**
116 * Returns the minimum number of dvec4 elements needed to pack a type.
117 *
118 * For simple types, it will return 1 (a single dvec4); for matrices, the
119 * number of columns; for arrays and structs, the sum of the dvec4 sizes of
120 * their elements; and for samplers and atomics, zero.
121 *
122 * This method is useful to calculate how much register space is needed to
123 * store a particular type.
124 *
125 * Measuring double-precision vertex inputs as dvec4 is required because
126 * ARB_vertex_attrib_64bit states that these use the same number of locations
127 * as the single-precision version. That is, two consecutive dvec4s would be
128 * located in location "x" and location "x+1", not "x+2".
129 *
130 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
131 * remap_vs_attrs() will take into account both the location and whether the
132 * type fits in one or two vec4 slots.
133 */
134 int
135 type_size_dvec4(const struct glsl_type *type, bool bindless)
136 {
137 return type_size_xvec4(type, false, bindless);
138 }
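/* A rough illustration of the counting rules above (informal sketch, not
 * asserted anywhere in this file):
 *
 *    type_size_vec4(float)  == 1   (a scalar still occupies a full vec4)
 *    type_size_vec4(mat4)   == 4   (one slot per column)
 *    type_size_vec4(dvec4)  == 2   (dual-slot type)
 *    type_size_dvec4(dvec4) == 1
 *    type_size_dvec4(dmat4) == 4   (vs. 8 vec4 slots)
 */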
139
140 static bool
141 remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
142 enum tess_primitive_mode _primitive_mode)
143 {
144 const int location = nir_intrinsic_base(intr);
145 const unsigned component = nir_intrinsic_component(intr);
146 bool out_of_bounds = false;
147 bool write = !nir_intrinsic_infos[intr->intrinsic].has_dest;
148 unsigned mask = write ? nir_intrinsic_write_mask(intr) : 0;
149 nir_def *src = NULL, *dest = NULL;
150
151 if (write) {
152 assert(intr->num_components == intr->src[0].ssa->num_components);
153 } else {
154 assert(intr->num_components == intr->def.num_components);
155 }
156
157 if (location == VARYING_SLOT_TESS_LEVEL_INNER) {
158 b->cursor = write ? nir_before_instr(&intr->instr)
159 : nir_after_instr(&intr->instr);
160
161 switch (_primitive_mode) {
162 case TESS_PRIMITIVE_QUADS:
163 /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */
164 nir_intrinsic_set_base(intr, 0);
165
166 if (write) {
167 assert(intr->src[0].ssa->num_components == 2);
168
169 intr->num_components = 4;
170
171 nir_def *undef = nir_undef(b, 1, 32);
172 nir_def *x = nir_channel(b, intr->src[0].ssa, 0);
173 nir_def *y = nir_channel(b, intr->src[0].ssa, 1);
174 src = nir_vec4(b, undef, undef, y, x);
175 mask = !!(mask & WRITEMASK_X) << 3 | !!(mask & WRITEMASK_Y) << 2;
176 } else if (intr->def.num_components > 1) {
177 assert(intr->def.num_components == 2);
178
179 intr->num_components = 4;
180 intr->def.num_components = 4;
181
182 unsigned wz[2] = { 3, 2 };
183 dest = nir_swizzle(b, &intr->def, wz, 2);
184 } else {
185 nir_intrinsic_set_component(intr, 3 - component);
186 }
187 break;
188 case TESS_PRIMITIVE_TRIANGLES:
189 /* gl_TessLevelInner[0] lives at DWord 4. */
190 nir_intrinsic_set_base(intr, 1);
191 mask &= WRITEMASK_X;
192 out_of_bounds = component > 0;
193 break;
194 case TESS_PRIMITIVE_ISOLINES:
195 out_of_bounds = true;
196 break;
197 default:
198 unreachable("Bogus tessellation domain");
199 }
200 } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) {
201 b->cursor = write ? nir_before_instr(&intr->instr)
202 : nir_after_instr(&intr->instr);
203
204 nir_intrinsic_set_base(intr, 1);
205
206 switch (_primitive_mode) {
207 case TESS_PRIMITIVE_QUADS:
208 case TESS_PRIMITIVE_TRIANGLES:
209 /* Quads: gl_TessLevelOuter[0..3] lives at DWords 7-4 (reversed).
210 * Triangles: gl_TessLevelOuter[0..2] lives at DWords 7-5 (reversed).
211 */
212 if (write) {
213 assert(intr->src[0].ssa->num_components == 4);
214
215 unsigned wzyx[4] = { 3, 2, 1, 0 };
216 src = nir_swizzle(b, intr->src[0].ssa, wzyx, 4);
217 mask = !!(mask & WRITEMASK_X) << 3 | !!(mask & WRITEMASK_Y) << 2 |
218 !!(mask & WRITEMASK_Z) << 1 | !!(mask & WRITEMASK_W) << 0;
219
220 /* Don't overwrite the inner factor at DWord 4 for triangles */
221 if (_primitive_mode == TESS_PRIMITIVE_TRIANGLES)
222 mask &= ~WRITEMASK_X;
223 } else if (intr->def.num_components > 1) {
224 assert(intr->def.num_components == 4);
225
226 unsigned wzyx[4] = { 3, 2, 1, 0 };
227 dest = nir_swizzle(b, &intr->def, wzyx, 4);
228 } else {
229 nir_intrinsic_set_component(intr, 3 - component);
230 out_of_bounds = component == 3 &&
231 _primitive_mode == TESS_PRIMITIVE_TRIANGLES;
232 }
233 break;
234 case TESS_PRIMITIVE_ISOLINES:
235 /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */
236 if (write) {
237 assert(intr->src[0].ssa->num_components == 4);
238
239 nir_def *undef = nir_undef(b, 1, 32);
240 nir_def *x = nir_channel(b, intr->src[0].ssa, 0);
241 nir_def *y = nir_channel(b, intr->src[0].ssa, 1);
242 src = nir_vec4(b, undef, undef, x, y);
243 mask = !!(mask & WRITEMASK_X) << 2 | !!(mask & WRITEMASK_Y) << 3;
244 } else {
245 nir_intrinsic_set_component(intr, 2 + component);
246 out_of_bounds = component > 1;
247 }
248 break;
249 default:
250 unreachable("Bogus tessellation domain");
251 }
252 } else {
253 return false;
254 }
255
256 if (out_of_bounds) {
257 if (!write)
258 nir_def_rewrite_uses(&intr->def, nir_undef(b, 1, 32));
259 nir_instr_remove(&intr->instr);
260 } else if (write) {
261 nir_intrinsic_set_write_mask(intr, mask);
262
263 if (src) {
264 nir_src_rewrite(&intr->src[0], src);
265 }
266 } else if (dest) {
267 nir_def_rewrite_uses_after(&intr->def, dest,
268 dest->parent_instr);
269 }
270
271 return true;
272 }
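/* For reference, the patch URB header layout assumed by the remapping above
 * (consolidating the per-case comments), in DWords:
 *
 *    Quads:     inner[1], inner[0] at DWords 2-3; outer[3..0] at DWords 4-7
 *    Triangles: inner[0] at DWord 4;              outer[2..0] at DWords 5-7
 *    Isolines:  outer[0..1] at DWords 6-7 (not reversed)
 *
 * so e.g. a quad-domain write of gl_TessLevelInner.xy becomes a base-0 vec4
 * store of (undef, undef, y, x) with the writemask moved from .xy to .wz.
 */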
273
274 static bool
275 is_input(nir_intrinsic_instr *intrin)
276 {
277 return intrin->intrinsic == nir_intrinsic_load_input ||
278 intrin->intrinsic == nir_intrinsic_load_per_primitive_input ||
279 intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
280 intrin->intrinsic == nir_intrinsic_load_interpolated_input;
281 }
282
283 static bool
284 is_output(nir_intrinsic_instr *intrin)
285 {
286 return intrin->intrinsic == nir_intrinsic_load_output ||
287 intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
288 intrin->intrinsic == nir_intrinsic_load_per_view_output ||
289 intrin->intrinsic == nir_intrinsic_store_output ||
290 intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
291 intrin->intrinsic == nir_intrinsic_store_per_view_output;
292 }
293
294
295 static bool
296 remap_patch_urb_offsets(nir_block *block, nir_builder *b,
297 const struct intel_vue_map *vue_map,
298 enum tess_primitive_mode tes_primitive_mode)
299 {
300 nir_foreach_instr_safe(instr, block) {
301 if (instr->type != nir_instr_type_intrinsic)
302 continue;
303
304 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
305
306 gl_shader_stage stage = b->shader->info.stage;
307
308 if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
309 (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
310
311 if (remap_tess_levels(b, intrin, tes_primitive_mode))
312 continue;
313
314 int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
315 assert(vue_slot != -1);
316 intrin->const_index[0] = vue_slot;
317
318 nir_src *vertex = nir_get_io_arrayed_index_src(intrin);
319 if (vertex) {
320 if (nir_src_is_const(*vertex)) {
321 intrin->const_index[0] += nir_src_as_uint(*vertex) *
322 vue_map->num_per_vertex_slots;
323 } else {
324 b->cursor = nir_before_instr(&intrin->instr);
325
326 /* Multiply by the number of per-vertex slots. */
327 nir_def *vertex_offset =
328 nir_imul(b,
329 vertex->ssa,
330 nir_imm_int(b,
331 vue_map->num_per_vertex_slots));
332
333 /* Add it to the existing offset */
334 nir_src *offset = nir_get_io_offset_src(intrin);
335 nir_def *total_offset =
336 nir_iadd(b, vertex_offset,
337 offset->ssa);
338
339 nir_src_rewrite(offset, total_offset);
340 }
341 }
342 }
343 }
344 return true;
345 }
346
347 /* Replace store_per_view_output (and load_per_view_output) with plain
348 * store_output/load_output, folding the view index into the IO offset.
349 * Because we only use per-view outputs for position, the offset pitch is 1. */
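/* A minimal sketch of the rewrite performed below (hypothetical operands):
 *
 *    store_per_view_output(value, view, offset) -> store_output(value, offset + view)
 *    load_per_view_output(view, offset)         -> load_output(offset + view)
 *
 * with base/range/component and the other intrinsic indices copied over.
 */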
350 static bool
351 lower_per_view_outputs(nir_builder *b,
352 nir_intrinsic_instr *intrin,
353 UNUSED void *cb_data)
354 {
355 if (intrin->intrinsic != nir_intrinsic_store_per_view_output &&
356 intrin->intrinsic != nir_intrinsic_load_per_view_output)
357 return false;
358
359 b->cursor = nir_before_instr(&intrin->instr);
360
361 nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
362 nir_src *offset = nir_get_io_offset_src(intrin);
363
364 nir_def *new_offset = nir_iadd(b, view_index->ssa, offset->ssa);
365
366 nir_intrinsic_instr *new;
367 if (intrin->intrinsic == nir_intrinsic_store_per_view_output)
368 new = nir_store_output(b, intrin->src[0].ssa, new_offset);
369 else {
370 nir_def *new_def = nir_load_output(b, intrin->def.num_components,
371 intrin->def.bit_size, new_offset);
372 new = nir_instr_as_intrinsic(new_def->parent_instr);
373 }
374
375 nir_intrinsic_set_base(new, nir_intrinsic_base(intrin));
376 nir_intrinsic_set_range(new, nir_intrinsic_range(intrin));
377 nir_intrinsic_set_write_mask(new, nir_intrinsic_write_mask(intrin));
378 nir_intrinsic_set_component(new, nir_intrinsic_component(intrin));
379 nir_intrinsic_set_src_type(new, nir_intrinsic_src_type(intrin));
380 nir_intrinsic_set_io_semantics(new, nir_intrinsic_io_semantics(intrin));
381
382 if (intrin->intrinsic == nir_intrinsic_load_per_view_output)
383 nir_def_rewrite_uses(&intrin->def, &new->def);
384 nir_instr_remove(&intrin->instr);
385
386 return true;
387 }
388
389 static bool
390 brw_nir_lower_per_view_outputs(nir_shader *nir)
391 {
392 return nir_shader_intrinsics_pass(nir, lower_per_view_outputs,
393 nir_metadata_control_flow,
394 NULL);
395 }
396
397 void
398 brw_nir_lower_vs_inputs(nir_shader *nir)
399 {
400 /* Start with the location of the variable's base. */
401 nir_foreach_shader_in_variable(var, nir)
402 var->data.driver_location = var->data.location;
403
404 /* Now use nir_lower_io to walk dereference chains. Attribute arrays are
405 * loaded as one vec4 or dvec4 per element (or matrix column), depending on
406 * whether it is a double-precision type or not.
407 */
408 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
409 nir_lower_io_lower_64bit_to_32);
410
411 /* This pass needs actual constants */
412 nir_opt_constant_folding(nir);
413
414 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
415
416 /* The last step is to remap VERT_ATTRIB_* to actual registers */
417
418 /* Whether or not we have any system-generated values (SGVs). gl_DrawID is not
419 * included here as it lives in its own vec4.
420 */
421 const bool has_sgvs =
422 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
423 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
424 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
425 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
426
427 const unsigned num_inputs = util_bitcount64(nir->info.inputs_read);
428
429 nir_foreach_function_impl(impl, nir) {
430 nir_builder b = nir_builder_create(impl);
431
432 nir_foreach_block(block, impl) {
433 nir_foreach_instr_safe(instr, block) {
434 if (instr->type != nir_instr_type_intrinsic)
435 continue;
436
437 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
438
439 switch (intrin->intrinsic) {
440 case nir_intrinsic_load_first_vertex:
441 case nir_intrinsic_load_base_instance:
442 case nir_intrinsic_load_vertex_id_zero_base:
443 case nir_intrinsic_load_instance_id:
444 case nir_intrinsic_load_is_indexed_draw:
445 case nir_intrinsic_load_draw_id: {
446 b.cursor = nir_after_instr(&intrin->instr);
447
448 /* gl_VertexID and friends are stored by the VF as the last
449 * vertex element. We convert them to load_input intrinsics at
450 * the right location.
451 */
452 nir_intrinsic_instr *load =
453 nir_intrinsic_instr_create(nir, nir_intrinsic_load_input);
454 load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
455
456 nir_intrinsic_set_base(load, num_inputs);
457 switch (intrin->intrinsic) {
458 case nir_intrinsic_load_first_vertex:
459 nir_intrinsic_set_component(load, 0);
460 break;
461 case nir_intrinsic_load_base_instance:
462 nir_intrinsic_set_component(load, 1);
463 break;
464 case nir_intrinsic_load_vertex_id_zero_base:
465 nir_intrinsic_set_component(load, 2);
466 break;
467 case nir_intrinsic_load_instance_id:
468 nir_intrinsic_set_component(load, 3);
469 break;
470 case nir_intrinsic_load_draw_id:
471 case nir_intrinsic_load_is_indexed_draw:
472 /* gl_DrawID and IsIndexedDraw are stored right after
473 * gl_VertexID and friends if any of them exist.
474 */
475 nir_intrinsic_set_base(load, num_inputs + has_sgvs);
476 if (intrin->intrinsic == nir_intrinsic_load_draw_id)
477 nir_intrinsic_set_component(load, 0);
478 else
479 nir_intrinsic_set_component(load, 1);
480 break;
481 default:
482 unreachable("Invalid system value intrinsic");
483 }
484
485 load->num_components = 1;
486 nir_def_init(&load->instr, &load->def, 1, 32);
487 nir_builder_instr_insert(&b, &load->instr);
488
489 nir_def_replace(&intrin->def, &load->def);
490 break;
491 }
492
493 case nir_intrinsic_load_input: {
494 /* Attributes come in a contiguous block, ordered by their
495 * gl_vert_attrib value. That means we can compute the slot
496 * number for an attribute by masking out the enabled attributes
497 * before it and counting the bits.
498 */
499 int attr = nir_intrinsic_base(intrin);
500 int slot = util_bitcount64(nir->info.inputs_read &
501 BITFIELD64_MASK(attr));
502 nir_intrinsic_set_base(intrin, slot);
503 break;
504 }
505
506 default:
507 break; /* Nothing to do */
508 }
509 }
510 }
511 }
512 }
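/* After the pass above, the VF input layout (informal sketch) looks like:
 *
 *    base 0 .. num_inputs-1       user attributes, ordered by gl_vert_attrib
 *    base num_inputs              SGV vec4 (if has_sgvs): .x = first_vertex,
 *                                 .y = base_instance, .z = vertex_id_zero_base,
 *                                 .w = instance_id
 *    base num_inputs + has_sgvs   .x = draw_id, .y = is_indexed_draw
 */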
513
514 void
515 brw_nir_lower_vue_inputs(nir_shader *nir,
516 const struct intel_vue_map *vue_map)
517 {
518 nir_foreach_shader_in_variable(var, nir)
519 var->data.driver_location = var->data.location;
520
521 /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
522 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
523 nir_lower_io_lower_64bit_to_32);
524
525 /* This pass needs actual constants */
526 nir_opt_constant_folding(nir);
527
528 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
529
530 nir_foreach_function_impl(impl, nir) {
531 nir_foreach_block(block, impl) {
532 nir_foreach_instr(instr, block) {
533 if (instr->type != nir_instr_type_intrinsic)
534 continue;
535
536 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
537
538 if (intrin->intrinsic == nir_intrinsic_load_input ||
539 intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
540 /* Offset 0 is the VUE header, which contains
541 * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
542 * VARYING_SLOT_PSIZ [.w].
543 */
544 int varying = nir_intrinsic_base(intrin);
545 int vue_slot;
546 switch (varying) {
547 case VARYING_SLOT_PSIZ:
548 nir_intrinsic_set_base(intrin, 0);
549 nir_intrinsic_set_component(intrin, 3);
550 break;
551
552 default:
553 vue_slot = vue_map->varying_to_slot[varying];
554 assert(vue_slot != -1);
555 nir_intrinsic_set_base(intrin, vue_slot);
556 break;
557 }
558 }
559 }
560 }
561 }
562 }
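/* e.g. with the pass above, a load of VARYING_SLOT_PSIZ ends up at base 0
 * (the VUE header) with component 3, while any other varying simply has its
 * base remapped through vue_map->varying_to_slot[].
 */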
563
564 void
565 brw_nir_lower_tes_inputs(nir_shader *nir, const struct intel_vue_map *vue_map)
566 {
567 nir_foreach_shader_in_variable(var, nir)
568 var->data.driver_location = var->data.location;
569
570 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
571 nir_lower_io_lower_64bit_to_32);
572
573 /* This pass needs actual constants */
574 nir_opt_constant_folding(nir);
575
576 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
577
578 nir_foreach_function_impl(impl, nir) {
579 nir_builder b = nir_builder_create(impl);
580 nir_foreach_block(block, impl) {
581 remap_patch_urb_offsets(block, &b, vue_map,
582 nir->info.tess._primitive_mode);
583 }
584 }
585 }
586
587 static bool
588 lower_barycentric_per_sample(nir_builder *b,
589 nir_intrinsic_instr *intrin,
590 UNUSED void *cb_data)
591 {
592 if (intrin->intrinsic != nir_intrinsic_load_barycentric_pixel &&
593 intrin->intrinsic != nir_intrinsic_load_barycentric_centroid)
594 return false;
595
596 b->cursor = nir_before_instr(&intrin->instr);
597 nir_def *centroid =
598 nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
599 nir_intrinsic_interp_mode(intrin));
600 nir_def_replace(&intrin->def, centroid);
601 return true;
602 }
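/* i.e. load_barycentric_pixel/centroid are rewritten to load_barycentric_sample
 * with the same interpolation mode, forcing per-sample interpolation when
 * persample_interp is INTEL_ALWAYS (see brw_nir_lower_fs_inputs below).
 */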
603
604 /**
605 * Convert interpolateAtOffset() offsets from [-0.5, +0.5] floating point
606 * offsets to integer [-8, +7] offsets (in units of 1/16th of a pixel).
607 *
608 * We clamp to +7/16 on the upper end of the range, since +0.5 isn't
609 * representable in a S0.4 value; a naive conversion would give us -8/16,
610 * which is the opposite of what was intended.
611 *
612 * This is allowed by GL_ARB_gpu_shader5's quantization rules:
613 *
614 * "Not all values of <offset> may be supported; x and y offsets may
615 * be rounded to fixed-point values with the number of fraction bits
616 * given by the implementation-dependent constant
617 * FRAGMENT_INTERPOLATION_OFFSET_BITS."
618 */
619 static bool
620 lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin,
621 void *data)
622 {
623 if (intrin->intrinsic != nir_intrinsic_load_barycentric_at_offset)
624 return false;
625
626 b->cursor = nir_before_instr(&intrin->instr);
627
628 assert(intrin->src[0].ssa);
629 nir_def *offset =
630 nir_imin(b, nir_imm_int(b, 7),
631 nir_f2i32(b, nir_fmul_imm(b, intrin->src[0].ssa, 16)));
632
633 nir_src_rewrite(&intrin->src[0], offset);
634
635 return true;
636 }
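/* Worked example for the conversion above: an offset component of +0.5 becomes
 * f2i32(0.5 * 16) = 8, which imin() clamps to 7 (i.e. +7/16 of a pixel), while
 * -0.5 becomes f2i32(-8.0) = -8 and is left as-is (-8/16 = -0.5).
 */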
637
638 void
639 brw_nir_lower_fs_inputs(nir_shader *nir,
640 const struct intel_device_info *devinfo,
641 const struct brw_wm_prog_key *key)
642 {
643 nir_foreach_shader_in_variable(var, nir) {
644 var->data.driver_location = var->data.location;
645
646 /* Apply default interpolation mode.
647 *
648 * Everything defaults to smooth except for the legacy GL color
649 * built-in variables, which might be flat depending on API state.
650 */
651 if (var->data.interpolation == INTERP_MODE_NONE) {
652 const bool flat = key->flat_shade &&
653 (var->data.location == VARYING_SLOT_COL0 ||
654 var->data.location == VARYING_SLOT_COL1);
655
656 var->data.interpolation = flat ? INTERP_MODE_FLAT
657 : INTERP_MODE_SMOOTH;
658 }
659 }
660
661 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
662 nir_lower_io_lower_64bit_to_32 |
663 nir_lower_io_use_interpolated_input_intrinsics);
664 if (devinfo->ver >= 11)
665 nir_lower_interpolation(nir, ~0);
666
667 if (key->multisample_fbo == INTEL_NEVER) {
668 nir_lower_single_sampled(nir);
669 } else if (key->persample_interp == INTEL_ALWAYS) {
670 nir_shader_intrinsics_pass(nir, lower_barycentric_per_sample,
671 nir_metadata_control_flow,
672 NULL);
673 }
674
675 if (devinfo->ver < 20)
676 nir_shader_intrinsics_pass(nir, lower_barycentric_at_offset,
677 nir_metadata_control_flow,
678 NULL);
679
680 /* This pass needs actual constants */
681 nir_opt_constant_folding(nir);
682
683 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
684 }
685
686 void
687 brw_nir_lower_vue_outputs(nir_shader *nir)
688 {
689 nir_foreach_shader_out_variable(var, nir) {
690 var->data.driver_location = var->data.location;
691 }
692
693 nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
694 nir_lower_io_lower_64bit_to_32);
695 brw_nir_lower_per_view_outputs(nir);
696 }
697
698 void
699 brw_nir_lower_tcs_outputs(nir_shader *nir, const struct intel_vue_map *vue_map,
700 enum tess_primitive_mode tes_primitive_mode)
701 {
702 nir_foreach_shader_out_variable(var, nir) {
703 var->data.driver_location = var->data.location;
704 }
705
706 nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
707 nir_lower_io_lower_64bit_to_32);
708
709 /* This pass needs actual constants */
710 nir_opt_constant_folding(nir);
711
712 nir_io_add_const_offset_to_base(nir, nir_var_shader_out);
713
714 nir_foreach_function_impl(impl, nir) {
715 nir_builder b = nir_builder_create(impl);
716 nir_foreach_block(block, impl) {
717 remap_patch_urb_offsets(block, &b, vue_map, tes_primitive_mode);
718 }
719 }
720 }
721
722 void
723 brw_nir_lower_fs_outputs(nir_shader *nir)
724 {
725 nir_foreach_shader_out_variable(var, nir) {
726 var->data.driver_location =
727 SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) |
728 SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION);
729 }
730
731 nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0);
732 }
733
734 static bool
735 tag_speculative_access(nir_builder *b,
736 nir_intrinsic_instr *intrin,
737 void *unused)
738 {
739 if (intrin->intrinsic == nir_intrinsic_load_ubo &&
740 brw_nir_ubo_surface_index_is_pushable(intrin->src[0])) {
741 nir_intrinsic_set_access(intrin, ACCESS_CAN_SPECULATE |
742 nir_intrinsic_access(intrin));
743 return true;
744 }
745
746 return false;
747 }
748
749 static bool
750 brw_nir_tag_speculative_access(nir_shader *nir)
751 {
752 return nir_shader_intrinsics_pass(nir, tag_speculative_access,
753 nir_metadata_all, NULL);
754 }
755
756 #define OPT(pass, ...) ({ \
757 bool this_progress = false; \
758 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
759 if (this_progress) \
760 progress = true; \
761 this_progress; \
762 })
763
764 #define LOOP_OPT(pass, ...) ({ \
765 const unsigned long this_line = __LINE__; \
766 bool this_progress = false; \
767 if (opt_line == this_line) \
768 break; \
769 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
770 if (this_progress) { \
771 progress = true; \
772 opt_line = this_line; \
773 } \
774 this_progress; \
775 })
776
777 #define LOOP_OPT_NOT_IDEMPOTENT(pass, ...) ({ \
778 bool this_progress = false; \
779 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
780 if (this_progress) { \
781 progress = true; \
782 opt_line = 0; \
783 } \
784 this_progress; \
785 })
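/* How the macros above interact (informal summary): LOOP_OPT records in
 * opt_line the source line of the last pass that made progress. When the
 * optimization loop comes back around to that same line without anything else
 * having made progress in between, it breaks out early instead of re-running
 * the remaining passes. LOOP_OPT_NOT_IDEMPOTENT resets opt_line to 0 on
 * progress, so passes that can keep finding new work (e.g. algebraic
 * optimizations) force at least one more full iteration.
 */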
786
787 void
788 brw_nir_optimize(nir_shader *nir,
789 const struct intel_device_info *devinfo)
790 {
791 bool progress;
792 unsigned lower_flrp =
793 (nir->options->lower_flrp16 ? 16 : 0) |
794 (nir->options->lower_flrp32 ? 32 : 0) |
795 (nir->options->lower_flrp64 ? 64 : 0);
796
797 unsigned long opt_line = 0;
798 do {
799 progress = false;
800 /* This pass is causing problems with types used by OpenCL:
801 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
802 *
803 * Running with it disabled made no difference in the resulting assembly
804 * code.
805 */
806 if (nir->info.stage != MESA_SHADER_KERNEL)
807 LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
808 LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
809 LOOP_OPT(nir_opt_deref);
810 if (LOOP_OPT(nir_opt_memcpy))
811 LOOP_OPT(nir_split_var_copies);
812 LOOP_OPT(nir_lower_vars_to_ssa);
813 if (!nir->info.var_copies_lowered) {
814 /* Only run this pass if nir_lower_var_copies was not called
815 * yet. That would lower away any copy_deref instructions and we
816 * don't want to introduce any more.
817 */
818 LOOP_OPT(nir_opt_find_array_copies);
819 }
820 LOOP_OPT(nir_opt_copy_prop_vars);
821 LOOP_OPT(nir_opt_dead_write_vars);
822 LOOP_OPT(nir_opt_combine_stores, nir_var_all);
823
824 LOOP_OPT(nir_opt_ray_queries);
825 LOOP_OPT(nir_opt_ray_query_ranges);
826
827 LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
828
829 LOOP_OPT(nir_copy_prop);
830
831 LOOP_OPT(nir_lower_phis_to_scalar, false);
832
833 LOOP_OPT(nir_copy_prop);
834 LOOP_OPT(nir_opt_dce);
835 LOOP_OPT(nir_opt_cse);
836 LOOP_OPT(nir_opt_combine_stores, nir_var_all);
837
838 /* Passing 0 to the peephole select pass causes it to convert
839 * if-statements that contain only move instructions in the branches
840 * regardless of the count.
841 *
842 * Passing 1 to the peephole select pass causes it to convert
843 * if-statements that contain at most a single ALU instruction (total)
844 * in both branches. Before Gfx6, some math instructions were
845 * prohibitively expensive and the results of compare operations need an
846 * extra resolve step. For these reasons, this pass is more harmful
847 * than good on those platforms.
848 *
849 * For indirect loads of uniforms (push constants), we assume that array
850 * indices will nearly always be in bounds and the cost of the load is
851 * low. Therefore there shouldn't be a performance benefit to avoiding it.
852 */
853 LOOP_OPT(nir_opt_peephole_select, 0, true, false);
854 LOOP_OPT(nir_opt_peephole_select, 8, true, true);
855
856 LOOP_OPT(nir_opt_intrinsics);
857 LOOP_OPT(nir_opt_idiv_const, 32);
858 LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
859
860 LOOP_OPT(nir_opt_generate_bfi);
861 LOOP_OPT(nir_opt_reassociate_bfi);
862
863 LOOP_OPT(nir_lower_constant_convert_alu_types);
864 LOOP_OPT(nir_opt_constant_folding);
865
866 if (lower_flrp != 0) {
867 if (LOOP_OPT(nir_lower_flrp,
868 lower_flrp,
869 false /* always_precise */)) {
870 LOOP_OPT(nir_opt_constant_folding);
871 }
872
873 /* Nothing should rematerialize any flrps, so we only need to do this
874 * lowering once.
875 */
876 lower_flrp = 0;
877 }
878
879 LOOP_OPT(nir_opt_dead_cf);
880 if (LOOP_OPT(nir_opt_loop)) {
881 /* If nir_opt_loop makes progress, then we need to clean
882 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
883 * to make progress.
884 */
885 LOOP_OPT(nir_copy_prop);
886 LOOP_OPT(nir_opt_dce);
887 }
888 LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
889 LOOP_OPT(nir_opt_conditional_discard);
890 if (nir->options->max_unroll_iterations != 0) {
891 LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
892 }
893 LOOP_OPT(nir_opt_remove_phis);
894 LOOP_OPT(nir_opt_gcm, false);
895 LOOP_OPT(nir_opt_undef);
896 LOOP_OPT(nir_lower_pack);
897 } while (progress);
898
899 /* Work around a Gfxbench unused local sampler variable which would trigger an
900 * assert in the opt_large_constants pass.
901 */
902 OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
903 }
904
905 static unsigned
906 lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
907 {
908 switch (instr->type) {
909 case nir_instr_type_alu: {
910 nir_alu_instr *alu = nir_instr_as_alu(instr);
911 switch (alu->op) {
912 case nir_op_bit_count:
913 case nir_op_ufind_msb:
914 case nir_op_ifind_msb:
915 case nir_op_find_lsb:
916 /* These are handled specially because the destination is always
917 * 32-bit and so the bit size of the instruction is given by the
918 * source.
919 */
920 return alu->src[0].src.ssa->bit_size >= 32 ? 0 : 32;
921 default:
922 break;
923 }
924
925 if (alu->def.bit_size >= 32)
926 return 0;
927
928 /* Note: nir_op_iabs and nir_op_ineg are not lowered here because the
929 * 8-bit ABS or NEG instruction should eventually get copy propagated
930 * into the MOV that does the type conversion. This results in far
931 * fewer MOV instructions.
932 */
933 switch (alu->op) {
934 case nir_op_idiv:
935 case nir_op_imod:
936 case nir_op_irem:
937 case nir_op_udiv:
938 case nir_op_umod:
939 case nir_op_fceil:
940 case nir_op_ffloor:
941 case nir_op_ffract:
942 case nir_op_fround_even:
943 case nir_op_ftrunc:
944 return 32;
945 case nir_op_frcp:
946 case nir_op_frsq:
947 case nir_op_fsqrt:
948 case nir_op_fpow:
949 case nir_op_fexp2:
950 case nir_op_flog2:
951 case nir_op_fsin:
952 case nir_op_fcos:
953 return 0;
954 case nir_op_isign:
955 assert(!"Should have been lowered by nir_opt_algebraic.");
956 return 0;
957 default:
958 if (nir_op_infos[alu->op].num_inputs >= 2 &&
959 alu->def.bit_size == 8)
960 return 16;
961
962 if (nir_alu_instr_is_comparison(alu) &&
963 alu->src[0].src.ssa->bit_size == 8)
964 return 16;
965
966 return 0;
967 }
968 break;
969 }
970
971 case nir_instr_type_intrinsic: {
972 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
973 switch (intrin->intrinsic) {
974 case nir_intrinsic_read_invocation:
975 case nir_intrinsic_read_first_invocation:
976 case nir_intrinsic_vote_feq:
977 case nir_intrinsic_vote_ieq:
978 case nir_intrinsic_shuffle:
979 case nir_intrinsic_shuffle_xor:
980 case nir_intrinsic_shuffle_up:
981 case nir_intrinsic_shuffle_down:
982 case nir_intrinsic_quad_broadcast:
983 case nir_intrinsic_quad_swap_horizontal:
984 case nir_intrinsic_quad_swap_vertical:
985 case nir_intrinsic_quad_swap_diagonal:
986 if (intrin->src[0].ssa->bit_size == 8)
987 return 16;
988 return 0;
989
990 case nir_intrinsic_reduce:
991 case nir_intrinsic_inclusive_scan:
992 case nir_intrinsic_exclusive_scan:
993 /* There are a couple of register region issues that make things
994 * complicated for 8-bit types:
995 *
996 * 1. Only raw moves are allowed to write to a packed 8-bit
997 * destination.
998 * 2. If we use a strided destination, the efficient way to do
999 * scan operations ends up using strides that are too big to
1000 * encode in an instruction.
1001 *
1002 * To get around these issues, we just do all 8-bit scan operations
1003 * in 16 bits. It's actually fewer instructions than what we'd have
1004 * to do if we were trying to do it in native 8-bit types and the
1005 * results are the same once we truncate to 8 bits at the end.
1006 */
1007 if (intrin->def.bit_size == 8)
1008 return 16;
1009 return 0;
1010
1011 default:
1012 return 0;
1013 }
1014 break;
1015 }
1016
1017 case nir_instr_type_phi: {
1018 nir_phi_instr *phi = nir_instr_as_phi(instr);
1019 if (phi->def.bit_size == 8)
1020 return 16;
1021 return 0;
1022 }
1023
1024 default:
1025 return 0;
1026 }
1027 }
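/* Examples of what the callback above requests: an 8-bit iadd or an 8-bit
 * comparison is promoted to 16 bits, a 16-bit integer division or fceil is
 * done in 32 bits, and find_lsb/ufind_msb on sub-32-bit sources run at
 * 32 bits since their destination is always 32-bit.
 */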
1028
1029 /* On gfx12.5+, if the offsets are not both constant and in the [-8, 7] range,
1030 * we will have nir_lower_tex() lower the source offset by returning true from
1031 * this filter function.
1032 */
1033 static bool
1034 lower_xehp_tg4_offset_filter(const nir_instr *instr, UNUSED const void *data)
1035 {
1036 if (instr->type != nir_instr_type_tex)
1037 return false;
1038
1039 nir_tex_instr *tex = nir_instr_as_tex(instr);
1040
1041 if (tex->op != nir_texop_tg4)
1042 return false;
1043
1044 int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset);
1045 if (offset_index < 0)
1046 return false;
1047
1048 if (!nir_src_is_const(tex->src[offset_index].src))
1049 return true;
1050
1051 int64_t offset_x = nir_src_comp_as_int(tex->src[offset_index].src, 0);
1052 int64_t offset_y = nir_src_comp_as_int(tex->src[offset_index].src, 1);
1053
1054 return offset_x < -8 || offset_x > 7 || offset_y < -8 || offset_y > 7;
1055 }
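/* e.g. a tg4 with a non-constant offset is always lowered, a constant offset
 * of (3, -4) is left alone, and a constant offset of (9, 0) is lowered since
 * it does not fit the signed 4-bit immediate range.
 */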
1056
1057 /* Does some simple lowering and runs the standard suite of optimizations
1058 *
1059 * This is intended to be called more-or-less directly after you get the
1060 * shader out of GLSL or some other source. While it is geared towards i965,
1061 * it is not at all generator-specific.
1062 */
1063 void
1064 brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
1065 const struct brw_nir_compiler_opts *opts)
1066 {
1067 const struct intel_device_info *devinfo = compiler->devinfo;
1068 UNUSED bool progress; /* Written by OPT */
1069
1070 nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
1071
1072 OPT(nir_lower_frexp);
1073
1074 OPT(nir_lower_alu_to_scalar, NULL, NULL);
1075
1076 if (nir->info.stage == MESA_SHADER_GEOMETRY)
1077 OPT(nir_lower_gs_intrinsics, 0);
1078
1079 /* See also brw_nir_trig_workarounds.py */
1080 if (compiler->precise_trig &&
1081 !(devinfo->ver >= 10 || devinfo->platform == INTEL_PLATFORM_KBL))
1082 OPT(brw_nir_apply_trig_workarounds);
1083
1084 /* This workaround exists for performance reasons. Since it requires not
1085 * setting RENDER_SURFACE_STATE::SurfaceArray when the array length is 1,
1086 * we're losing the HW robustness feature in that case.
1087 *
1088 * So when robust image access is enabled, just avoid the workaround.
1089 */
1090 if (intel_needs_workaround(devinfo, 1806565034) && !opts->robust_image_access)
1091 OPT(intel_nir_clamp_image_1d_2d_array_sizes);
1092
1093 const struct intel_nir_lower_texture_opts intel_tex_options = {
1094 .combined_lod_or_bias_and_offset = compiler->devinfo->ver >= 20,
1095 };
1096 OPT(intel_nir_lower_texture, &intel_tex_options);
1097
1098 const nir_lower_tex_options tex_options = {
1099 .lower_txp = ~0,
1100 .lower_txf_offset = true,
1101 .lower_rect_offset = true,
1102 .lower_txd_cube_map = true,
1103 /* For below, see bspec 45942, "Enable new message layout for cube array" */
1104 .lower_txd_3d = devinfo->verx10 >= 125,
1105 .lower_txd_array = devinfo->verx10 >= 125,
1106 .lower_txb_shadow_clamp = true,
1107 .lower_txd_shadow_clamp = true,
1108 .lower_txd_offset_clamp = true,
1109 .lower_tg4_offsets = true,
1110 .lower_txs_lod = true, /* Wa_14012320009 */
1111 .lower_offset_filter =
1112 devinfo->verx10 >= 125 ? lower_xehp_tg4_offset_filter : NULL,
1113 .lower_invalid_implicit_lod = true,
1114 };
1115
1116 /* In the case where TG4 coords are lowered to offsets and we have a
1117 * lower_xehp_tg4_offset_filter lowering those offsets further, we need to
1118 * rerun the pass because the instructions inserted by the first lowering
1119 * are not visible during that first pass.
1120 */
1121 if (OPT(nir_lower_tex, &tex_options)) {
1122 OPT(intel_nir_lower_texture, &intel_tex_options);
1123 OPT(nir_lower_tex, &tex_options);
1124 }
1125
1126 OPT(nir_normalize_cubemap_coords);
1127
1128 OPT(nir_lower_global_vars_to_local);
1129
1130 OPT(nir_split_var_copies);
1131 OPT(nir_split_struct_vars, nir_var_function_temp);
1132
1133 brw_nir_optimize(nir, devinfo);
1134
1135 struct nir_opt_16bit_tex_image_options options = {
1136 .rounding_mode = nir_rounding_mode_undef,
1137 .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
1138 };
1139 OPT(nir_opt_16bit_tex_image, &options);
1140
1141 OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
1142 if (OPT(nir_lower_int64_float_conversions)) {
1143 OPT(nir_opt_algebraic);
1144 OPT(nir_lower_doubles, opts->softfp64,
1145 nir->options->lower_doubles_options);
1146 }
1147
1148 OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
1149
1150 /* Lower a bunch of stuff */
1151 OPT(nir_lower_var_copies);
1152
1153 /* This needs to be run after the first optimization pass but before we
1154 * lower indirect derefs away
1155 */
1156 OPT(nir_opt_large_constants, NULL, 32);
1157
1158 OPT(nir_lower_load_const_to_scalar);
1159
1160 OPT(nir_lower_system_values);
1161 nir_lower_compute_system_values_options lower_csv_options = {
1162 .has_base_workgroup_id = nir->info.stage == MESA_SHADER_COMPUTE,
1163 };
1164 OPT(nir_lower_compute_system_values, &lower_csv_options);
1165
1166 const nir_lower_subgroups_options subgroups_options = {
1167 .ballot_bit_size = 32,
1168 .ballot_components = 1,
1169 .lower_to_scalar = true,
1170 .lower_relative_shuffle = true,
1171 .lower_quad_broadcast_dynamic = true,
1172 .lower_elect = true,
1173 .lower_inverse_ballot = true,
1174 .lower_rotate_to_shuffle = true,
1175 };
1176 OPT(nir_lower_subgroups, &subgroups_options);
1177
1178 nir_variable_mode indirect_mask =
1179 brw_nir_no_indirect_mask(compiler, nir->info.stage);
1180 OPT(nir_lower_indirect_derefs, indirect_mask, UINT32_MAX);
1181
1182 /* Even in cases where we can handle indirect temporaries via scratch, it
1183 * can still be expensive. Lower indirects on small arrays to
1184 * conditional load/stores.
1185 *
1186 * The threshold of 16 was chosen semi-arbitrarily. The idea is that an
1187 * indirect on an array of 16 elements is about 30 instructions at which
1188 * point, you may be better off doing a send. With a SIMD8 program, 16
1189 * floats is 1/8 of the entire register file. Any array larger than that
1190 * is likely to cause pressure issues. Also, this value is sufficiently
1191 * high that the benchmarks known to suffer from large temporary array
1192 * issues are helped but nothing else in shader-db is hurt except for maybe
1193 * that one kerbal space program shader.
1194 */
1195 if (!(indirect_mask & nir_var_function_temp))
1196 OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
1197
1198 /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
1199 * SSBOs, our back-end is capable of loading an entire vec4 at a time and
1200 * we would like to take advantage of that whenever possible regardless of
1201 * whether or not the app gives us full loads. This should allow the
1202 * optimizer to combine UBO and SSBO load operations and save us some send
1203 * messages.
1204 */
1205 OPT(nir_lower_array_deref_of_vec,
1206 nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
1207 nir_lower_direct_array_deref_of_vec_load);
1208
1209 /* Clamp load_per_vertex_input of the TCS stage so that we do not generate
1210 * loads reading out of bounds. We can do this here because we called
1211 * nir_lower_system_values above.
1212 */
1213 if (nir->info.stage == MESA_SHADER_TESS_CTRL &&
1214 compiler->use_tcs_multi_patch)
1215 OPT(intel_nir_clamp_per_vertex_loads);
1216
1217 /* Get rid of split copies */
1218 brw_nir_optimize(nir, devinfo);
1219 }
1220
1221 static bool
1222 brw_nir_zero_inputs_instr(struct nir_builder *b, nir_intrinsic_instr *intrin,
1223 void *data)
1224 {
1225 if (intrin->intrinsic != nir_intrinsic_load_deref)
1226 return false;
1227
1228 nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1229 if (!nir_deref_mode_is(deref, nir_var_shader_in))
1230 return false;
1231
1232 if (deref->deref_type != nir_deref_type_var)
1233 return false;
1234
1235 nir_variable *var = deref->var;
1236
1237 uint64_t zero_inputs = *(uint64_t *)data;
1238 if (!(BITFIELD64_BIT(var->data.location) & zero_inputs))
1239 return false;
1240
1241 b->cursor = nir_before_instr(&intrin->instr);
1242
1243 nir_def *zero = nir_imm_zero(b, 1, 32);
1244
1245 nir_def_replace(&intrin->def, zero);
1246
1247 return true;
1248 }
1249
1250 static bool
1251 brw_nir_zero_inputs(nir_shader *shader, uint64_t *zero_inputs)
1252 {
1253 return nir_shader_intrinsics_pass(shader, brw_nir_zero_inputs_instr,
1254 nir_metadata_control_flow,
1255 zero_inputs);
1256 }
1257
1258 /* Code for Wa_18019110168 may have created input/output variables beyond
1259 * VARYING_SLOT_MAX and removed uses of variables below VARYING_SLOT_MAX.
1260 * Clean it up, so they all stay below VARYING_SLOT_MAX.
1261 */
1262 static void
1263 brw_mesh_compact_io(nir_shader *mesh, nir_shader *frag)
1264 {
1265 gl_varying_slot mapping[VARYING_SLOT_MAX] = {0, };
1266 gl_varying_slot cur = VARYING_SLOT_VAR0;
1267 bool compact = false;
1268
1269 nir_foreach_shader_out_variable(var, mesh) {
1270 gl_varying_slot location = var->data.location;
1271 if (location < VARYING_SLOT_VAR0)
1272 continue;
1273 assert(location < ARRAY_SIZE(mapping));
1274
1275 const struct glsl_type *type = var->type;
1276 if (nir_is_arrayed_io(var, MESA_SHADER_MESH)) {
1277 assert(glsl_type_is_array(type));
1278 type = glsl_get_array_element(type);
1279 }
1280
1281 if (mapping[location])
1282 continue;
1283
1284 unsigned num_slots = glsl_count_attribute_slots(type, false);
1285
1286 compact |= location + num_slots > VARYING_SLOT_MAX;
1287
1288 mapping[location] = cur;
1289 cur += num_slots;
1290 }
1291
1292 if (!compact)
1293 return;
1294
1295 /* The rest of this function should be hit only for Wa_18019110168. */
1296
1297 nir_foreach_shader_out_variable(var, mesh) {
1298 gl_varying_slot location = var->data.location;
1299 if (location < VARYING_SLOT_VAR0)
1300 continue;
1301 location = mapping[location];
1302 if (location == 0)
1303 continue;
1304 var->data.location = location;
1305 }
1306
1307 nir_foreach_shader_in_variable(var, frag) {
1308 gl_varying_slot location = var->data.location;
1309 if (location < VARYING_SLOT_VAR0)
1310 continue;
1311 location = mapping[location];
1312 if (location == 0)
1313 continue;
1314 var->data.location = location;
1315 }
1316
1317 nir_shader_gather_info(mesh, nir_shader_get_entrypoint(mesh));
1318 nir_shader_gather_info(frag, nir_shader_get_entrypoint(frag));
1319
1320 if (should_print_nir(mesh)) {
1321 printf("%s\n", __func__);
1322 nir_print_shader(mesh, stdout);
1323 }
1324 if (should_print_nir(frag)) {
1325 printf("%s\n", __func__);
1326 nir_print_shader(frag, stdout);
1327 }
1328 }
1329
1330 void
1331 brw_nir_link_shaders(const struct brw_compiler *compiler,
1332 nir_shader *producer, nir_shader *consumer)
1333 {
1334 const struct intel_device_info *devinfo = compiler->devinfo;
1335
1336 if (producer->info.stage == MESA_SHADER_MESH &&
1337 consumer->info.stage == MESA_SHADER_FRAGMENT) {
1338 uint64_t fs_inputs = 0, ms_outputs = 0;
1339 /* gl_MeshPerPrimitiveEXT[].gl_ViewportIndex, gl_PrimitiveID and gl_Layer
1340 * are per primitive, but the fragment shader does not have them marked as
1341 * such. Add the annotation here.
1342 */
1343 nir_foreach_shader_in_variable(var, consumer) {
1344 fs_inputs |= BITFIELD64_BIT(var->data.location);
1345
1346 switch (var->data.location) {
1347 case VARYING_SLOT_LAYER:
1348 case VARYING_SLOT_PRIMITIVE_ID:
1349 case VARYING_SLOT_VIEWPORT:
1350 var->data.per_primitive = 1;
1351 break;
1352 default:
1353 continue;
1354 }
1355 }
1356
1357 nir_foreach_shader_out_variable(var, producer)
1358 ms_outputs |= BITFIELD64_BIT(var->data.location);
1359
1360 uint64_t zero_inputs = ~ms_outputs & fs_inputs;
1361 zero_inputs &= BITFIELD64_BIT(VARYING_SLOT_LAYER) |
1362 BITFIELD64_BIT(VARYING_SLOT_VIEWPORT);
1363
1364 if (zero_inputs)
1365 NIR_PASS(_, consumer, brw_nir_zero_inputs, &zero_inputs);
1366 }
1367
1368 nir_lower_io_arrays_to_elements(producer, consumer);
1369 nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
1370 nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
1371
1372 NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
1373 NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
1374 brw_nir_optimize(producer, devinfo);
1375 brw_nir_optimize(consumer, devinfo);
1376
1377 if (nir_link_opt_varyings(producer, consumer))
1378 brw_nir_optimize(consumer, devinfo);
1379
1380 NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
1381 NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
1382
1383 if (nir_remove_unused_varyings(producer, consumer)) {
1384 if (should_print_nir(producer)) {
1385 printf("nir_remove_unused_varyings\n");
1386 nir_print_shader(producer, stdout);
1387 }
1388 if (should_print_nir(consumer)) {
1389 printf("nir_remove_unused_varyings\n");
1390 nir_print_shader(consumer, stdout);
1391 }
1392
1393 NIR_PASS(_, producer, nir_lower_global_vars_to_local);
1394 NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
1395
1396 /* The backend might not be able to handle indirects on
1397 * temporaries so we need to lower indirects on any of the
1398 * varyings we have demoted here.
1399 */
1400 NIR_PASS(_, producer, nir_lower_indirect_derefs,
1401 brw_nir_no_indirect_mask(compiler, producer->info.stage),
1402 UINT32_MAX);
1403 NIR_PASS(_, consumer, nir_lower_indirect_derefs,
1404 brw_nir_no_indirect_mask(compiler, consumer->info.stage),
1405 UINT32_MAX);
1406
1407 brw_nir_optimize(producer, devinfo);
1408 brw_nir_optimize(consumer, devinfo);
1409
1410 if (producer->info.stage == MESA_SHADER_MESH &&
1411 consumer->info.stage == MESA_SHADER_FRAGMENT) {
1412 brw_mesh_compact_io(producer, consumer);
1413 }
1414 }
1415
1416 NIR_PASS(_, producer, nir_lower_io_to_vector, nir_var_shader_out);
1417
1418 if (producer->info.stage == MESA_SHADER_TESS_CTRL &&
1419 producer->options->vectorize_tess_levels)
1420 NIR_PASS_V(producer, nir_vectorize_tess_levels);
1421
1422 NIR_PASS(_, producer, nir_opt_combine_stores, nir_var_shader_out);
1423 NIR_PASS(_, consumer, nir_lower_io_to_vector, nir_var_shader_in);
1424
1425 if (producer->info.stage != MESA_SHADER_TESS_CTRL &&
1426 producer->info.stage != MESA_SHADER_MESH &&
1427 producer->info.stage != MESA_SHADER_TASK) {
1428 /* Calling lower_io_to_vector creates output variable writes with
1429 * write-masks. On non-TCS outputs, the back-end can't handle it and we
1430 * need to call nir_lower_io_to_temporaries to get rid of them. This,
1431 * in turn, creates temporary variables and extra copy_deref intrinsics
1432 * that we need to clean up.
1433 *
1434 * Note that Mesh/Task don't support I/O as temporaries (I/O is shared
1435 * between the whole workgroup, possibly using multiple HW threads). For
1436 * those, write-masks on outputs are handled by the I/O lowering.
1437 */
1438 NIR_PASS_V(producer, nir_lower_io_to_temporaries,
1439 nir_shader_get_entrypoint(producer), true, false);
1440 NIR_PASS(_, producer, nir_lower_global_vars_to_local);
1441 NIR_PASS(_, producer, nir_split_var_copies);
1442 NIR_PASS(_, producer, nir_lower_var_copies);
1443 }
1444
1445 if (producer->info.stage == MESA_SHADER_TASK &&
1446 consumer->info.stage == MESA_SHADER_MESH) {
1447
1448 for (unsigned i = 0; i < 3; ++i)
1449 assert(producer->info.mesh.ts_mesh_dispatch_dimensions[i] <= UINT16_MAX);
1450
1451 nir_lower_compute_system_values_options options = {
1452 .lower_workgroup_id_to_index = true,
1453 .num_workgroups[0] = producer->info.mesh.ts_mesh_dispatch_dimensions[0],
1454 .num_workgroups[1] = producer->info.mesh.ts_mesh_dispatch_dimensions[1],
1455 .num_workgroups[2] = producer->info.mesh.ts_mesh_dispatch_dimensions[2],
1456 /* nir_lower_idiv generates expensive code */
1457 .shortcut_1d_workgroup_id = compiler->devinfo->verx10 >= 125,
1458 };
1459
1460 NIR_PASS(_, consumer, nir_lower_compute_system_values, &options);
1461 }
1462 }
1463
1464 bool
1465 brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
1466 unsigned bit_size,
1467 unsigned num_components,
1468 int64_t hole_size,
1469 nir_intrinsic_instr *low,
1470 nir_intrinsic_instr *high,
1471 void *data)
1472 {
1473 /* Don't combine things to generate 64-bit loads/stores. We have to split
1474 * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
1475 * we don't want to make a mess for the back-end.
1476 */
1477 if (bit_size > 32)
1478 return false;
1479
1480 if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
1481 low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
1482 low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
1483 low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
1484 if (num_components > 4) {
1485 if (bit_size != 32)
1486 return false;
1487
1488 if (num_components > 32)
1489 return false;
1490
1491 if (hole_size >= 8 * 4)
1492 return false;
1493 }
1494 } else {
1495 /* We can handle at most a vec4 right now. Anything bigger would get
1496 * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
1497 */
1498 if (num_components > 4)
1499 return false;
1500
1501 if (hole_size > 4)
1502 return false;
1503 }
1504
1505
1506 const uint32_t align = nir_combined_align(align_mul, align_offset);
1507
1508 if (align < bit_size / 8)
1509 return false;
1510
1511 return true;
1512 }
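/* e.g. two adjacent dword-aligned 32-bit SSBO loads may be merged up to a
 * vec4, anything that would produce a 64-bit access is rejected, and the
 * load_*_uniform_block_intel intrinsics may grow up to 32 32-bit components
 * as long as any hole between the merged ranges stays under 32 bytes.
 */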
1513
1514 static
1515 bool combine_all_memory_barriers(nir_intrinsic_instr *a,
1516 nir_intrinsic_instr *b,
1517 void *data)
1518 {
1519 /* Combine control barriers with identical memory semantics. This prevents
1520 * the second barrier from generating a spurious fence message identical to
1521 * the one emitted for the first barrier.
1522 */
1523 if (nir_intrinsic_memory_modes(a) == nir_intrinsic_memory_modes(b) &&
1524 nir_intrinsic_memory_semantics(a) == nir_intrinsic_memory_semantics(b) &&
1525 nir_intrinsic_memory_scope(a) == nir_intrinsic_memory_scope(b)) {
1526 nir_intrinsic_set_execution_scope(a, MAX2(nir_intrinsic_execution_scope(a),
1527 nir_intrinsic_execution_scope(b)));
1528 return true;
1529 }
1530
1531 /* Only combine pure memory barriers */
1532 if ((nir_intrinsic_execution_scope(a) != SCOPE_NONE) ||
1533 (nir_intrinsic_execution_scope(b) != SCOPE_NONE))
1534 return false;
1535
1536 /* Translation to backend IR will get rid of modes we don't care about, so
1537 * no harm in always combining them.
1538 *
1539 * TODO: While HW has only ACQUIRE|RELEASE fences, we could improve the
1540 * scheduling so that it can take advantage of the different semantics.
1541 */
1542 nir_intrinsic_set_memory_modes(a, nir_intrinsic_memory_modes(a) |
1543 nir_intrinsic_memory_modes(b));
1544 nir_intrinsic_set_memory_semantics(a, nir_intrinsic_memory_semantics(a) |
1545 nir_intrinsic_memory_semantics(b));
1546 nir_intrinsic_set_memory_scope(a, MAX2(nir_intrinsic_memory_scope(a),
1547 nir_intrinsic_memory_scope(b)));
1548 return true;
1549 }
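/* e.g. two back-to-back barriers with identical modes, semantics and scope
 * collapse into one (keeping the larger execution scope), and two pure memory
 * fences covering different modes merge into a single fence covering both.
 */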
1550
1551 static nir_mem_access_size_align
1552 get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes,
1553 uint8_t bit_size, uint32_t align_mul, uint32_t align_offset,
1554 bool offset_is_const, enum gl_access_qualifier access,
1555 const void *cb_data)
1556 {
1557 const uint32_t align = nir_combined_align(align_mul, align_offset);
1558
1559 switch (intrin) {
1560 case nir_intrinsic_load_ssbo:
1561 case nir_intrinsic_load_shared:
1562 case nir_intrinsic_load_scratch:
1563 /* When the offset is constant, we can use 32-bit loads and just shift the
1564 * data around as needed.
1565 */
1566 if (align < 4 && offset_is_const) {
1567 assert(util_is_power_of_two_nonzero(align_mul) && align_mul >= 4);
1568 const unsigned pad = align_offset % 4;
1569 const unsigned comps32 = MIN2(DIV_ROUND_UP(bytes + pad, 4), 4);
1570 return (nir_mem_access_size_align) {
1571 .bit_size = 32,
1572 .num_components = comps32,
1573 .align = 4,
1574 .shift = nir_mem_access_shift_method_scalar,
1575 };
1576 }
1577 break;
1578
1579 case nir_intrinsic_load_task_payload:
1580 if (bytes < 4 || align < 4) {
1581 return (nir_mem_access_size_align) {
1582 .bit_size = 32,
1583 .num_components = 1,
1584 .align = 4,
1585 .shift = nir_mem_access_shift_method_scalar,
1586 };
1587 }
1588 break;
1589
1590 default:
1591 break;
1592 }
1593
1594 const bool is_load = nir_intrinsic_infos[intrin].has_dest;
1595 const bool is_scratch = intrin == nir_intrinsic_load_scratch ||
1596 intrin == nir_intrinsic_store_scratch;
1597
1598 if (align < 4 || bytes < 4) {
1599 /* Choose a byte, word, or dword */
1600 bytes = MIN2(bytes, 4);
1601 if (bytes == 3)
1602 bytes = is_load ? 4 : 2;
1603
1604 if (is_scratch) {
1605 /* The way scratch address swizzling works in the back-end, it
1606 * happens at a DWORD granularity so we can't have a single load
1607 * or store cross a DWORD boundary.
1608 */
1609 if ((align_offset % 4) + bytes > MIN2(align_mul, 4))
1610 bytes = MIN2(align_mul, 4) - (align_offset % 4);
1611
1612 /* Must be a power of two */
1613 if (bytes == 3)
1614 bytes = 2;
1615 }
1616
1617 return (nir_mem_access_size_align) {
1618 .bit_size = bytes * 8,
1619 .num_components = 1,
1620 .align = 1,
1621 .shift = nir_mem_access_shift_method_scalar,
1622 };
1623 } else {
1624 bytes = MIN2(bytes, 16);
1625 return (nir_mem_access_size_align) {
1626 .bit_size = 32,
1627 .num_components = is_scratch ? 1 :
1628 is_load ? DIV_ROUND_UP(bytes, 4) : bytes / 4,
1629 .align = 4,
1630 .shift = nir_mem_access_shift_method_scalar,
1631 };
1632 }
1633 }
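/* Informal examples of the choices above: a 3-byte store with alignment 1 is
 * emitted as a 16-bit access while a 3-byte load gets rounded up to a 32-bit
 * one, and a dword-aligned 16-byte load becomes a single 32-bit 4-component
 * access (except for scratch, which stays scalar).
 */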
1634
1635 static void
1636 brw_vectorize_lower_mem_access(nir_shader *nir,
1637 const struct brw_compiler *compiler,
1638 enum brw_robustness_flags robust_flags)
1639 {
1640 bool progress = false;
1641
1642 nir_load_store_vectorize_options options = {
1643 .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
1644 nir_var_mem_global | nir_var_mem_shared |
1645 nir_var_mem_task_payload,
1646 .callback = brw_nir_should_vectorize_mem,
1647 .robust_modes = (nir_variable_mode)0,
1648 };
1649
1650 if (robust_flags & BRW_ROBUSTNESS_UBO)
1651 options.robust_modes |= nir_var_mem_ubo;
1652 if (robust_flags & BRW_ROBUSTNESS_SSBO)
1653 options.robust_modes |= nir_var_mem_ssbo;
1654
1655 OPT(nir_opt_load_store_vectorize, &options);
1656
1657 /* When the HW supports block loads, use the divergence analysis to
1658 * find uniform SSBO loads and turn them into block loads.
1659 *
1660 * Rerun the vectorizer afterwards to build the largest possible block
1661 * loads.
1662 *
1663 * This is a win on two fronts:
1664 * - fewer send messages
1665 * - reduced register pressure
1666 */
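/* Illustrative sketch (not from the original source): an SSBO load whose
 * address the divergence analysis proves uniform across the subgroup can
 * be rewritten as a load_ssbo_uniform_block_intel, fetching the data once
 * for the whole subgroup instead of once per invocation.
 */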
1667 nir_divergence_analysis(nir);
1668 if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo)) {
1669 OPT(nir_opt_load_store_vectorize, &options);
1670
1671 OPT(nir_opt_constant_folding);
1672 OPT(nir_copy_prop);
1673
1674 if (OPT(brw_nir_rebase_const_offset_ubo_loads)) {
1675 OPT(nir_opt_cse);
1676 OPT(nir_copy_prop);
1677
1678 nir_load_store_vectorize_options ubo_options = {
1679 .modes = nir_var_mem_ubo,
1680 .callback = brw_nir_should_vectorize_mem,
1681 .robust_modes = options.robust_modes & nir_var_mem_ubo,
1682 };
1683
1684 OPT(nir_opt_load_store_vectorize, &ubo_options);
1685 }
1686 }
1687
1688 nir_lower_mem_access_bit_sizes_options mem_access_options = {
1689 .modes = nir_var_mem_ssbo |
1690 nir_var_mem_constant |
1691 nir_var_mem_task_payload |
1692 nir_var_shader_temp |
1693 nir_var_function_temp |
1694 nir_var_mem_global |
1695 nir_var_mem_shared,
1696 .callback = get_mem_access_size_align,
1697 };
1698 OPT(nir_lower_mem_access_bit_sizes, &mem_access_options);
1699
1700 while (progress) {
1701 progress = false;
1702
1703 OPT(nir_lower_pack);
1704 OPT(nir_copy_prop);
1705 OPT(nir_opt_dce);
1706 OPT(nir_opt_cse);
1707 OPT(nir_opt_algebraic);
1708 OPT(nir_opt_constant_folding);
1709 }
1710 }
1711
1712 static bool
1713 nir_shader_has_local_variables(const nir_shader *nir)
1714 {
1715 nir_foreach_function_impl(impl, nir) {
1716 if (!exec_list_is_empty(&impl->locals))
1717 return true;
1718 }
1719
1720 return false;
1721 }
1722
1723 /* Prepare the given shader for codegen
1724 *
1725 * This function is intended to be called right before going into the actual
1726 * backend and is highly backend-specific. Also, once this function has been
1727 * called on a shader, it will no longer be in SSA form so most optimizations
1728 * will not work.
1729 */
1730 void
1731 brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
1732 bool debug_enabled,
1733 enum brw_robustness_flags robust_flags)
1734 {
1735 const struct intel_device_info *devinfo = compiler->devinfo;
1736
1737 UNUSED bool progress; /* Written by OPT */
1738
1739 OPT(intel_nir_lower_sparse_intrinsics);
1740
1741 OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
1742
1743 OPT(nir_opt_combine_barriers, combine_all_memory_barriers, NULL);
1744
1745 do {
1746 progress = false;
1747 OPT(nir_opt_algebraic_before_ffma);
1748 } while (progress);
1749
1750 if (devinfo->verx10 >= 125) {
1751 /* Lower integer division by constants before nir_lower_idiv. */
1752 OPT(nir_opt_idiv_const, 32);
1753 const nir_lower_idiv_options options = {
1754 .allow_fp16 = false
1755 };
1756 OPT(nir_lower_idiv, &options);
1757 }
1758
1759 if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
1760 NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);
1761
1762 OPT(brw_nir_tag_speculative_access);
1763
1764 brw_nir_optimize(nir, devinfo);
1765
1766 if (nir_shader_has_local_variables(nir)) {
1767 OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
1768 glsl_get_natural_size_align_bytes);
1769 OPT(nir_lower_explicit_io, nir_var_function_temp,
1770 nir_address_format_32bit_offset);
1771 brw_nir_optimize(nir, devinfo);
1772 }
1773
1774 brw_vectorize_lower_mem_access(nir, compiler, robust_flags);
1775
1776 /* Needs to run prior to int64 lowering because it generates 64-bit
1777 * address manipulations.
1778 */
1779 OPT(intel_nir_lower_printf);
1780
1781 /* Potentially perform this optimization pass twice because it can create
1782 * additional opportunities for itself.
1783 */
1784 if (OPT(nir_opt_algebraic_before_lower_int64))
1785 OPT(nir_opt_algebraic_before_lower_int64);
1786
1787 if (OPT(nir_lower_int64))
1788 brw_nir_optimize(nir, devinfo);
1789
1790 /* Try to fuse multiply-adds; if successful, run shrink_vectors to
1791 * keep peephole_ffma from generating things like this:
1792 * vec16 ssa_0 = ...
1793 * vec16 ssa_1 = fneg ssa_0
1794 * vec1 ssa_2 = ffma ssa_1, ...
1795 *
1796 * We want this instead:
1797 * vec16 ssa_0 = ...
1798 * vec1 ssa_1 = fneg ssa_0.x
1799 * vec1 ssa_2 = ffma ssa_1, ...
1800 */
1801 if (OPT(intel_nir_opt_peephole_ffma))
1802 OPT(nir_opt_shrink_vectors, false);
1803
1804 OPT(intel_nir_opt_peephole_imul32x16);
1805
1806 if (OPT(nir_opt_comparison_pre)) {
1807 OPT(nir_copy_prop);
1808 OPT(nir_opt_dce);
1809 OPT(nir_opt_cse);
1810
1811 /* Do the select peephole again. nir_opt_comparison_pre (combined with
1812 * the other optimization passes) will have removed at least one
1813 * instruction from one of the branches of the if-statement, so now it
1814 * might be under the threshold of conversion to bcsel.
1815 */
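/* Illustrative example (not from the original source): after the cleanup
 * above, an if-statement such as
 *    if (a < b) { x = c; } else { x = d; }
 * may now fall under the peephole threshold and be flattened into
 *    x = bcsel(a < b, c, d);
 */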
1816 OPT(nir_opt_peephole_select, 0, false, false);
1817 OPT(nir_opt_peephole_select, 1, false, true);
1818 }
1819
1820 do {
1821 progress = false;
1822
1823 OPT(brw_nir_opt_fsat);
1824 OPT(nir_opt_algebraic_late);
1825 OPT(brw_nir_lower_fsign);
1826
1827 if (progress) {
1828 OPT(nir_opt_constant_folding);
1829 OPT(nir_copy_prop);
1830 OPT(nir_opt_dce);
1831 OPT(nir_opt_cse);
1832 }
1833 } while (progress);
1834
1835
1836 if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) {
1837 if (OPT(nir_lower_int64)) {
1838 brw_nir_optimize(nir, devinfo);
1839 }
1840 }
1841
1842 OPT(nir_lower_alu_to_scalar, NULL, NULL);
1843
1844 while (OPT(nir_opt_algebraic_distribute_src_mods)) {
1845 OPT(nir_opt_constant_folding);
1846 OPT(nir_copy_prop);
1847 OPT(nir_opt_dce);
1848 OPT(nir_opt_cse);
1849 }
1850
1851 OPT(nir_copy_prop);
1852 OPT(nir_opt_dce);
1853 OPT(nir_opt_move, nir_move_comparisons);
1854 OPT(nir_opt_dead_cf);
1855
1856 bool divergence_analysis_dirty = false;
1857 NIR_PASS_V(nir, nir_divergence_analysis);
1858
1859 static const nir_lower_subgroups_options subgroups_options = {
1860 .ballot_bit_size = 32,
1861 .ballot_components = 1,
1862 .lower_elect = true,
1863 .lower_subgroup_masks = true,
1864 };
1865
1866 if (OPT(nir_opt_uniform_atomics, false)) {
1867 OPT(nir_lower_subgroups, &subgroups_options);
1868
1869 OPT(nir_opt_algebraic_before_lower_int64);
1870
1871 if (OPT(nir_lower_int64))
1872 brw_nir_optimize(nir, devinfo);
1873
1874 divergence_analysis_dirty = true;
1875 }
1876
1877 /* nir_opt_uniform_subgroup can create some operations (e.g.,
1878 * load_subgroup_lt_mask) that need to be lowered again.
1879 */
1880 if (OPT(nir_opt_uniform_subgroup, &subgroups_options)) {
1881 /* Some of the optimizations can generate 64-bit integer multiplication
1882 * that must be lowered.
1883 */
1884 OPT(nir_lower_int64);
1885
1886 /* Even if nir_lower_int64 did not make progress, re-run the main
1887 * optimization loop. nir_opt_uniform_subgroup may have caused values
1888 * that previously appeared divergent to become convergent. This
1889 * allows the elimination of some loops over, say, a TXF instruction
1890 * with a non-uniform texture handle.
1891 */
1892 brw_nir_optimize(nir, devinfo);
1893
1894 OPT(nir_lower_subgroups, &subgroups_options);
1895 }
1896
1897 /* Run intel_nir_lower_conversions only after the last time
1898 * brw_nir_optimize is called. Various optimizations invoked there can
1899 * rematerialize the conversions that the lowering pass eliminates.
1900 */
1901 OPT(intel_nir_lower_conversions);
1902
1903 /* Do this only after the last opt_gcm. GCM will undo this lowering. */
1904 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1905 if (divergence_analysis_dirty) {
1906 NIR_PASS_V(nir, nir_divergence_analysis);
1907 }
1908
1909 OPT(intel_nir_lower_non_uniform_barycentric_at_sample);
1910 }
1911
1912 OPT(nir_lower_bool_to_int32);
1913 OPT(nir_copy_prop);
1914 OPT(nir_opt_dce);
1915
1916 OPT(nir_lower_locals_to_regs, 32);
1917
1918 if (unlikely(debug_enabled)) {
1919 /* Re-index SSA defs so we print more sensible numbers. */
1920 nir_foreach_function_impl(impl, nir) {
1921 nir_index_ssa_defs(impl);
1922 }
1923
1924 fprintf(stderr, "NIR (SSA form) for %s shader:\n",
1925 _mesa_shader_stage_to_string(nir->info.stage));
1926 nir_print_shader(nir, stderr);
1927 }
1928
1929 nir_validate_ssa_dominance(nir, "before nir_convert_from_ssa");
1930
1931 /* Rerun the divergence analysis before convert_from_ssa as this pass
1932 * asserts on consistent divergence flags.
1933 */
1934 NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
1935 NIR_PASS_V(nir, nir_divergence_analysis);
1936
1937 OPT(nir_convert_from_ssa, true);
1938
1939 OPT(nir_opt_dce);
1940
1941 if (OPT(nir_opt_rematerialize_compares))
1942 OPT(nir_opt_dce);
1943
1944 /* The mesh stages require this pass to be called at the last minute,
1945 * but if it does anything, it will also constant fold, which undoes the
1946 * work done by nir_trivialize_registers, so call it right before that
1947 * pass instead.
1948 */
1949 if (nir->info.stage == MESA_SHADER_MESH ||
1950 nir->info.stage == MESA_SHADER_TASK)
1951 brw_nir_adjust_payload(nir);
1952
1953 nir_trivialize_registers(nir);
1954
1955 nir_sweep(nir);
1956
1957 if (unlikely(debug_enabled)) {
1958 fprintf(stderr, "NIR (final form) for %s shader:\n",
1959 _mesa_shader_stage_to_string(nir->info.stage));
1960 nir_print_shader(nir, stderr);
1961 }
1962 }
1963
1964 static unsigned
1965 get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
1966 {
1967 switch (info->subgroup_size) {
1968 case SUBGROUP_SIZE_API_CONSTANT:
1969 /* We have to use the global constant size. */
1970 return BRW_SUBGROUP_SIZE;
1971
1972 case SUBGROUP_SIZE_UNIFORM:
1973 /* It has to be uniform across all invocations but can vary per stage
1974 * if we want. This gives us a bit more freedom.
1975 *
1976 * For compute, brw_nir_apply_key is called per-dispatch-width so this
1977 * is the actual subgroup size and not a maximum. However, we only
1978 * invoke one size of any given compute shader so it's still guaranteed
1979 * to be uniform across invocations.
1980 */
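/* Illustrative example (not from the original source): a compute shader
 * compiled for a SIMD16 dispatch is handed max_subgroup_size == 16 here,
 * which is the subgroup size that particular compile will actually use.
 */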
1981 return max_subgroup_size;
1982
1983 case SUBGROUP_SIZE_VARYING:
1984 /* The subgroup size is allowed to be fully varying. For geometry
1985 * stages, we know it's always 8, which is max_subgroup_size, so we can
1986 * return that. For compute, brw_nir_apply_key is called once per
1987 * dispatch-width so max_subgroup_size is the real subgroup size.
1988 *
1989 * For fragment, we return 0 and let it fall through to the back-end
1990 * compiler. This means we can't optimize based on subgroup size but
1991 * that's a risk the client took when it asked for a varying subgroup
1992 * size.
1993 */
1994 return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
1995
1996 case SUBGROUP_SIZE_REQUIRE_4:
1997 unreachable("Unsupported subgroup size type");
1998
1999 case SUBGROUP_SIZE_REQUIRE_8:
2000 case SUBGROUP_SIZE_REQUIRE_16:
2001 case SUBGROUP_SIZE_REQUIRE_32:
2002 assert(gl_shader_stage_uses_workgroup(info->stage) ||
2003 (info->stage >= MESA_SHADER_RAYGEN && info->stage <= MESA_SHADER_CALLABLE));
2004 /* These enum values are expressly chosen to be equal to the subgroup
2005 * size that they require.
2006 */
2007 return info->subgroup_size;
2008
2009 case SUBGROUP_SIZE_FULL_SUBGROUPS:
2010 case SUBGROUP_SIZE_REQUIRE_64:
2011 case SUBGROUP_SIZE_REQUIRE_128:
2012 break;
2013 }
2014
2015 unreachable("Invalid subgroup size type");
2016 }
2017
2018 unsigned
2019 brw_nir_api_subgroup_size(const nir_shader *nir,
2020 unsigned hw_subgroup_size)
2021 {
2022 return get_subgroup_size(&nir->info, hw_subgroup_size);
2023 }
2024
2025 void
2026 brw_nir_apply_key(nir_shader *nir,
2027 const struct brw_compiler *compiler,
2028 const struct brw_base_prog_key *key,
2029 unsigned max_subgroup_size)
2030 {
2031 bool progress = false;
2032
2033 nir_lower_tex_options nir_tex_opts = {
2034 .lower_txd_clamp_bindless_sampler = true,
2035 .lower_txd_clamp_if_sampler_index_not_lt_16 = true,
2036 .lower_invalid_implicit_lod = true,
2037 .lower_index_to_offset = true,
2038 };
2039 OPT(nir_lower_tex, &nir_tex_opts);
2040
2041 const struct intel_nir_lower_texture_opts tex_opts = {
2042 .combined_lod_and_array_index = compiler->devinfo->ver >= 20,
2043 };
2044 OPT(intel_nir_lower_texture, &tex_opts);
2045
2046 const nir_lower_subgroups_options subgroups_options = {
2047 .subgroup_size = get_subgroup_size(&nir->info, max_subgroup_size),
2048 .ballot_bit_size = 32,
2049 .ballot_components = 1,
2050 .lower_subgroup_masks = true,
2051 };
2052 OPT(nir_lower_subgroups, &subgroups_options);
2053
2054 if (key->limit_trig_input_range)
2055 OPT(brw_nir_limit_trig_input_range_workaround);
2056
2057 if (progress) {
2058 brw_nir_optimize(nir, compiler->devinfo);
2059 }
2060 }
2061
2062 enum brw_conditional_mod
2063 brw_cmod_for_nir_comparison(nir_op op)
2064 {
2065 switch (op) {
2066 case nir_op_flt:
2067 case nir_op_flt32:
2068 case nir_op_ilt:
2069 case nir_op_ilt32:
2070 case nir_op_ult:
2071 case nir_op_ult32:
2072 return BRW_CONDITIONAL_L;
2073
2074 case nir_op_fge:
2075 case nir_op_fge32:
2076 case nir_op_ige:
2077 case nir_op_ige32:
2078 case nir_op_uge:
2079 case nir_op_uge32:
2080 return BRW_CONDITIONAL_GE;
2081
2082 case nir_op_feq:
2083 case nir_op_feq32:
2084 case nir_op_ieq:
2085 case nir_op_ieq32:
2086 case nir_op_b32all_fequal2:
2087 case nir_op_b32all_iequal2:
2088 case nir_op_b32all_fequal3:
2089 case nir_op_b32all_iequal3:
2090 case nir_op_b32all_fequal4:
2091 case nir_op_b32all_iequal4:
2092 return BRW_CONDITIONAL_Z;
2093
2094 case nir_op_fneu:
2095 case nir_op_fneu32:
2096 case nir_op_ine:
2097 case nir_op_ine32:
2098 case nir_op_b32any_fnequal2:
2099 case nir_op_b32any_inequal2:
2100 case nir_op_b32any_fnequal3:
2101 case nir_op_b32any_inequal3:
2102 case nir_op_b32any_fnequal4:
2103 case nir_op_b32any_inequal4:
2104 return BRW_CONDITIONAL_NZ;
2105
2106 default:
2107 unreachable("Unsupported NIR comparison op");
2108 }
2109 }
2110
2111 enum lsc_opcode
2112 lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin)
2113 {
2114 switch (intrin->intrinsic) {
2115 case nir_intrinsic_load_ssbo:
2116 case nir_intrinsic_load_shared:
2117 case nir_intrinsic_load_global:
2118 case nir_intrinsic_load_global_block_intel:
2119 case nir_intrinsic_load_global_constant:
2120 case nir_intrinsic_load_global_constant_uniform_block_intel:
2121 case nir_intrinsic_load_shared_block_intel:
2122 case nir_intrinsic_load_shared_uniform_block_intel:
2123 case nir_intrinsic_load_ssbo_block_intel:
2124 case nir_intrinsic_load_ssbo_uniform_block_intel:
2125 case nir_intrinsic_load_ubo_uniform_block_intel:
2126 case nir_intrinsic_load_scratch:
2127 return LSC_OP_LOAD;
2128
2129 case nir_intrinsic_store_ssbo:
2130 case nir_intrinsic_store_shared:
2131 case nir_intrinsic_store_global:
2132 case nir_intrinsic_store_global_block_intel:
2133 case nir_intrinsic_store_shared_block_intel:
2134 case nir_intrinsic_store_ssbo_block_intel:
2135 case nir_intrinsic_store_scratch:
2136 return LSC_OP_STORE;
2137
2138 case nir_intrinsic_image_load:
2139 case nir_intrinsic_bindless_image_load:
2140 return LSC_OP_LOAD_CMASK;
2141
2142 case nir_intrinsic_image_store:
2143 case nir_intrinsic_bindless_image_store:
2144 return LSC_OP_STORE_CMASK;
2145
2146 default:
2147 assert(nir_intrinsic_has_atomic_op(intrin));
2148 break;
2149 }
2150
2151 switch (nir_intrinsic_atomic_op(intrin)) {
2152 case nir_atomic_op_iadd: {
2153 unsigned src_idx;
2154 switch (intrin->intrinsic) {
2155 case nir_intrinsic_image_atomic:
2156 case nir_intrinsic_bindless_image_atomic:
2157 src_idx = 3;
2158 break;
2159 case nir_intrinsic_ssbo_atomic:
2160 src_idx = 2;
2161 break;
2162 case nir_intrinsic_shared_atomic:
2163 case nir_intrinsic_global_atomic:
2164 src_idx = 1;
2165 break;
2166 default:
2167 unreachable("Invalid add atomic opcode");
2168 }
2169
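/* Illustrative note (not from the original source): a constant add of +1
 * or -1 is mapped to the dedicated LSC increment/decrement opcodes below
 * instead of the generic atomic add.
 */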
2170 if (nir_src_is_const(intrin->src[src_idx])) {
2171 int64_t add_val = nir_src_as_int(intrin->src[src_idx]);
2172 if (add_val == 1)
2173 return LSC_OP_ATOMIC_INC;
2174 else if (add_val == -1)
2175 return LSC_OP_ATOMIC_DEC;
2176 }
2177 return LSC_OP_ATOMIC_ADD;
2178 }
2179
2180 case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN;
2181 case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN;
2182 case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX;
2183 case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX;
2184 case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND;
2185 case nir_atomic_op_ior: return LSC_OP_ATOMIC_OR;
2186 case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR;
2187 case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE;
2188 case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG;
2189
2190 case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN;
2191 case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX;
2192 case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG;
2193 case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD;
2194
2195 default:
2196 unreachable("Unsupported NIR atomic intrinsic");
2197 }
2198 }
2199
2200 enum brw_reg_type
2201 brw_type_for_nir_type(const struct intel_device_info *devinfo,
2202 nir_alu_type type)
2203 {
2204 switch (type) {
2205 case nir_type_uint:
2206 case nir_type_uint32:
2207 return BRW_TYPE_UD;
2208 case nir_type_bool:
2209 case nir_type_int:
2210 case nir_type_bool32:
2211 case nir_type_int32:
2212 return BRW_TYPE_D;
2213 case nir_type_float:
2214 case nir_type_float32:
2215 return BRW_TYPE_F;
2216 case nir_type_float16:
2217 return BRW_TYPE_HF;
2218 case nir_type_float64:
2219 return BRW_TYPE_DF;
2220 case nir_type_int64:
2221 return BRW_TYPE_Q;
2222 case nir_type_uint64:
2223 return BRW_TYPE_UQ;
2224 case nir_type_int16:
2225 return BRW_TYPE_W;
2226 case nir_type_uint16:
2227 return BRW_TYPE_UW;
2228 case nir_type_int8:
2229 return BRW_TYPE_B;
2230 case nir_type_uint8:
2231 return BRW_TYPE_UB;
2232 default:
2233 unreachable("unknown type");
2234 }
2235
2236 return BRW_TYPE_F;
2237 }
2238
2239 nir_shader *
2240 brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler,
2241 const struct brw_tcs_prog_key *key)
2242 {
2243 assert(key->input_vertices > 0);
2244
2245 const nir_shader_compiler_options *options =
2246 compiler->nir_options[MESA_SHADER_TESS_CTRL];
2247
2248 uint64_t inputs_read = key->outputs_written &
2249 ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
2250
2251 unsigned locations[64];
2252 unsigned num_locations = 0;
2253
2254 u_foreach_bit64(varying, inputs_read)
2255 locations[num_locations++] = varying;
2256
2257 nir_shader *nir =
2258 nir_create_passthrough_tcs_impl(options, locations, num_locations,
2259 key->input_vertices);
2260
2261 ralloc_steal(mem_ctx, nir);
2262
2263 nir->info.inputs_read = inputs_read;
2264 nir->info.tess._primitive_mode = key->_tes_primitive_mode;
2265 nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs");
2266
2267 struct brw_nir_compiler_opts opts = {};
2268 brw_preprocess_nir(compiler, nir, &opts);
2269
2270 return nir;
2271 }
2272
2273 nir_def *
2274 brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
2275 nir_def *base_addr, unsigned off)
2276 {
2277 assert(load_uniform->intrinsic == nir_intrinsic_load_uniform);
2278
2279 unsigned bit_size = load_uniform->def.bit_size;
2280 assert(bit_size >= 8 && bit_size % 8 == 0);
2281 unsigned byte_size = bit_size / 8;
2282 nir_def *sysval;
2283
2284 if (nir_src_is_const(load_uniform->src[0])) {
2285 uint64_t offset = off +
2286 nir_intrinsic_base(load_uniform) +
2287 nir_src_as_uint(load_uniform->src[0]);
2288
2289 /* Things should be component-aligned. */
2290 assert(offset % byte_size == 0);
2291
2292 unsigned suboffset = offset % 64;
2293 uint64_t aligned_offset = offset - suboffset;
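/* Worked example (illustrative, not from the original source): a constant
 * offset of 100 bytes gives suboffset = 36 and aligned_offset = 64, so the
 * two 64B block loads below read from base_addr + 64 and base_addr + 128
 * and nir_extract_bits pulls the requested components out starting at
 * bit 36 * 8.
 */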
2294
2295 /* Load two just in case we go over a 64B boundary */
2296 nir_def *data[2];
2297 for (unsigned i = 0; i < 2; i++) {
2298 nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
2299
2300 data[i] = nir_load_global_constant_uniform_block_intel(
2301 b, 16, 32, addr,
2302 .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
2303 .align_mul = 64);
2304 }
2305
2306 sysval = nir_extract_bits(b, data, 2, suboffset * 8,
2307 load_uniform->num_components, bit_size);
2308 } else {
2309 nir_def *offset32 =
2310 nir_iadd_imm(b, load_uniform->src[0].ssa,
2311 off + nir_intrinsic_base(load_uniform));
2312 nir_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset32));
2313 sysval = nir_load_global_constant(b, addr, byte_size,
2314 load_uniform->num_components, bit_size);
2315 }
2316
2317 return sysval;
2318 }
2319
2320 const struct glsl_type *
2321 brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
2322 {
2323 const struct glsl_type *type = var->interface_type;
2324 if (!type) {
2325 type = var->type;
2326 if (nir_is_arrayed_io(var, nir->info.stage)) {
2327 assert(glsl_type_is_array(type));
2328 type = glsl_get_array_element(type);
2329 }
2330 }
2331
2332 return type;
2333 }
2334
2335 bool
2336 brw_nir_uses_inline_data(nir_shader *shader)
2337 {
2338 nir_foreach_function_impl(impl, shader) {
2339 nir_foreach_block(block, impl) {
2340 nir_foreach_instr(instr, block) {
2341 if (instr->type != nir_instr_type_intrinsic)
2342 continue;
2343
2344 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2345 if (intrin->intrinsic != nir_intrinsic_load_inline_data_intel)
2346 continue;
2347
2348 return true;
2349 }
2350 }
2351 }
2352
2353 return false;
2354 }
2355
2356 /**
2357 * Move load_interpolated_input with simple (payload-based) barycentric modes
2358 * to the top of the program so we don't emit multiple PLNs for the same input.
2359 *
2360 * This works around CSE not being able to handle non-dominating cases
2361 * such as:
2362 *
2363 * if (...) {
2364 * interpolate input
2365 * } else {
2366 * interpolate the same exact input
2367 * }
2368 *
2369 * This should be replaced by global value numbering someday.
2370 */
2371 bool
2372 brw_nir_move_interpolation_to_top(nir_shader *nir)
2373 {
2374 bool progress = false;
2375
2376 nir_foreach_function_impl(impl, nir) {
2377 nir_block *top = nir_start_block(impl);
2378 nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
2379 bool impl_progress = false;
2380
2381 for (nir_block *block = nir_block_cf_tree_next(top);
2382 block != NULL;
2383 block = nir_block_cf_tree_next(block)) {
2384
2385 nir_foreach_instr_safe(instr, block) {
2386 if (instr->type != nir_instr_type_intrinsic)
2387 continue;
2388
2389 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2390 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
2391 continue;
2392 nir_intrinsic_instr *bary_intrinsic =
2393 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
2394 nir_intrinsic_op op = bary_intrinsic->intrinsic;
2395
2396 /* Leave interpolateAtSample/Offset() where they are. */
2397 if (op == nir_intrinsic_load_barycentric_at_sample ||
2398 op == nir_intrinsic_load_barycentric_at_offset)
2399 continue;
2400
2401 nir_instr *move[3] = {
2402 &bary_intrinsic->instr,
2403 intrin->src[1].ssa->parent_instr,
2404 instr
2405 };
2406
2407 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
2408 if (move[i]->block != top) {
2409 nir_instr_move(cursor, move[i]);
2410 impl_progress = true;
2411 }
2412 }
2413 }
2414 }
2415
2416 progress = progress || impl_progress;
2417
2418 nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
2419 : nir_metadata_all);
2420 }
2421
2422 return progress;
2423 }
2424
2425 static bool
2426 filter_simd(const nir_instr *instr, UNUSED const void *options)
2427 {
2428 if (instr->type != nir_instr_type_intrinsic)
2429 return false;
2430
2431 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
2432 case nir_intrinsic_load_simd_width_intel:
2433 case nir_intrinsic_load_subgroup_id:
2434 return true;
2435
2436 default:
2437 return false;
2438 }
2439 }
2440
2441 static nir_def *
2442 lower_simd(nir_builder *b, nir_instr *instr, void *options)
2443 {
2444 uintptr_t simd_width = (uintptr_t)options;
2445
2446 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
2447 case nir_intrinsic_load_simd_width_intel:
2448 return nir_imm_int(b, simd_width);
2449
2450 case nir_intrinsic_load_subgroup_id:
2451 /* If the whole workgroup fits in one thread, we can lower subgroup_id
2452 * to a constant zero.
2453 */
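/* Illustrative example (not from the original source): a fixed 4x2x1
 * workgroup has 8 invocations, so at SIMD8 or wider subgroup_id folds
 * to the constant 0, while an 8x8x1 workgroup (64 invocations) compiled
 * at SIMD16 keeps the intrinsic.
 */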
2454 if (!b->shader->info.workgroup_size_variable) {
2455 unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
2456 b->shader->info.workgroup_size[1] *
2457 b->shader->info.workgroup_size[2];
2458 if (local_workgroup_size <= simd_width)
2459 return nir_imm_int(b, 0);
2460 }
2461 return NULL;
2462
2463 default:
2464 return NULL;
2465 }
2466 }
2467
2468 bool
2469 brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
2470 {
2471 return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
2472 (void *)(uintptr_t)dispatch_width);
2473 }
2474
2475
2476