1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "intel_nir.h"
25 #include "brw_nir.h"
26 #include "compiler/glsl_types.h"
27 #include "compiler/nir/nir_builder.h"
28 
29 /*
30  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
31  * false) elements needed to pack a type.
32  */
33 static int
34 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
35 {
36    unsigned int i;
37    int size;
38 
39    switch (type->base_type) {
40    case GLSL_TYPE_UINT:
41    case GLSL_TYPE_INT:
42    case GLSL_TYPE_FLOAT:
43    case GLSL_TYPE_FLOAT16:
44    case GLSL_TYPE_BOOL:
45    case GLSL_TYPE_DOUBLE:
46    case GLSL_TYPE_UINT16:
47    case GLSL_TYPE_INT16:
48    case GLSL_TYPE_UINT8:
49    case GLSL_TYPE_INT8:
50    case GLSL_TYPE_UINT64:
51    case GLSL_TYPE_INT64:
52       if (glsl_type_is_matrix(type)) {
53          const glsl_type *col_type = glsl_get_column_type(type);
54          unsigned col_slots =
55             (as_vec4 && glsl_type_is_dual_slot(col_type)) ? 2 : 1;
56          return type->matrix_columns * col_slots;
57       } else {
58          /* Regardless of the size of the vector, it gets a vec4. This is bad
59           * packing for things like floats, but otherwise arrays become a
60           * mess.  Hopefully a later pass over the code can pack scalars
61           * down if appropriate.
62           */
63          return (as_vec4 && glsl_type_is_dual_slot(type)) ? 2 : 1;
64       }
65    case GLSL_TYPE_ARRAY:
66       assert(type->length > 0);
67       return type_size_xvec4(type->fields.array, as_vec4, bindless) *
68              type->length;
69    case GLSL_TYPE_STRUCT:
70    case GLSL_TYPE_INTERFACE:
71       size = 0;
72       for (i = 0; i < type->length; i++) {
73 	 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
74                                  bindless);
75       }
76       return size;
77    case GLSL_TYPE_SUBROUTINE:
78       return 1;
79 
80    case GLSL_TYPE_SAMPLER:
81    case GLSL_TYPE_TEXTURE:
82       /* Samplers and textures take up no register space, since they're baked
83        * in at link time.
84        */
85       return bindless ? 1 : 0;
86    case GLSL_TYPE_ATOMIC_UINT:
87       return 0;
88    case GLSL_TYPE_IMAGE:
89       return bindless ? 1 : 0;
90    case GLSL_TYPE_VOID:
91    case GLSL_TYPE_ERROR:
92    case GLSL_TYPE_COOPERATIVE_MATRIX:
93       unreachable("not reached");
94    }
95 
96    return 0;
97 }
98 
99 /**
100  * Returns the minimum number of vec4 elements needed to pack a type.
101  *
102  * For simple types, it will return 1 (a single vec4); for matrices, the
103  * number of columns; for arrays and structs, the sum of the vec4 sizes of
104  * their elements; and for samplers and atomics, zero.
105  *
106  * This method is useful to calculate how much register space is needed to
107  * store a particular type.
108  */
109 int
110 type_size_vec4(const struct glsl_type *type, bool bindless)
111 {
112    return type_size_xvec4(type, true, bindless);
113 }
114 
115 /**
116  * Returns the minimum number of dvec4 elements needed to pack a type.
117  *
118  * For simple types, it will return 1 (a single dvec4); for matrices, the
119  * number of columns; for arrays and structs, the sum of the dvec4 sizes of
120  * their elements; and for samplers and atomics, zero.
121  *
122  * This method is useful to calculate how much register space is needed to
123  * store a particular type.
124  *
125  * Measuring double-precision vertex inputs as dvec4 is required because
126  * ARB_vertex_attrib_64bit states that they use the same number of locations
127  * as the single-precision version. That is, two consecutive dvec4s would be
128  * located at location "x" and location "x+1", not "x+2".
129  *
130  * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
131  * remap_vs_attrs() will take into account both the location and whether the
132  * type fits in one or two vec4 slots.
133  */
134 int
135 type_size_dvec4(const struct glsl_type *type, bool bindless)
136 {
137    return type_size_xvec4(type, false, bindless);
138 }
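
/* For illustration, a few values these helpers should produce, assuming only
 * the dual-slot rules implemented in type_size_xvec4() above (a sketch, not
 * additional driver packing rules):
 *
 *    type_size_vec4(float)   == 1      type_size_dvec4(float)   == 1
 *    type_size_vec4(dvec3)   == 2      type_size_dvec4(dvec3)   == 1
 *    type_size_vec4(dmat3)   == 6      type_size_dvec4(dmat3)   == 3
 *    type_size_vec4(vec2[4]) == 4      type_size_dvec4(vec2[4]) == 4
 *
 * Measuring in dvec4 units collapses the two vec4 slots that a double
 * precision vector or matrix column needs back into one location, which is
 * what the ARB_vertex_attrib_64bit rule described above requires.
 */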
139 
140 static bool
141 remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
142                   enum tess_primitive_mode _primitive_mode)
143 {
144    const int location = nir_intrinsic_base(intr);
145    const unsigned component = nir_intrinsic_component(intr);
146    bool out_of_bounds = false;
147    bool write = !nir_intrinsic_infos[intr->intrinsic].has_dest;
148    unsigned mask = write ? nir_intrinsic_write_mask(intr) : 0;
149    nir_def *src = NULL, *dest = NULL;
150 
151    if (write) {
152       assert(intr->num_components == intr->src[0].ssa->num_components);
153    } else {
154       assert(intr->num_components == intr->def.num_components);
155    }
156 
157    if (location == VARYING_SLOT_TESS_LEVEL_INNER) {
158       b->cursor = write ? nir_before_instr(&intr->instr)
159                         : nir_after_instr(&intr->instr);
160 
161       switch (_primitive_mode) {
162       case TESS_PRIMITIVE_QUADS:
163          /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */
164          nir_intrinsic_set_base(intr, 0);
165 
166          if (write) {
167             assert(intr->src[0].ssa->num_components == 2);
168 
169             intr->num_components = 4;
170 
171             nir_def *undef = nir_undef(b, 1, 32);
172             nir_def *x = nir_channel(b, intr->src[0].ssa, 0);
173             nir_def *y = nir_channel(b, intr->src[0].ssa, 1);
174             src = nir_vec4(b, undef, undef, y, x);
175             mask = !!(mask & WRITEMASK_X) << 3 | !!(mask & WRITEMASK_Y) << 2;
176          } else if (intr->def.num_components > 1) {
177             assert(intr->def.num_components == 2);
178 
179             intr->num_components = 4;
180             intr->def.num_components = 4;
181 
182             unsigned wz[2] = { 3, 2 };
183             dest = nir_swizzle(b, &intr->def, wz, 2);
184          } else {
185             nir_intrinsic_set_component(intr, 3 - component);
186          }
187          break;
188       case TESS_PRIMITIVE_TRIANGLES:
189          /* gl_TessLevelInner[0] lives at DWord 4. */
190          nir_intrinsic_set_base(intr, 1);
191          mask &= WRITEMASK_X;
192          out_of_bounds = component > 0;
193          break;
194       case TESS_PRIMITIVE_ISOLINES:
195          out_of_bounds = true;
196          break;
197       default:
198          unreachable("Bogus tessellation domain");
199       }
200    } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) {
201       b->cursor = write ? nir_before_instr(&intr->instr)
202                         : nir_after_instr(&intr->instr);
203 
204       nir_intrinsic_set_base(intr, 1);
205 
206       switch (_primitive_mode) {
207       case TESS_PRIMITIVE_QUADS:
208       case TESS_PRIMITIVE_TRIANGLES:
209          /* Quads:     gl_TessLevelOuter[0..3] lives at DWords 7-4 (reversed).
210           * Triangles: gl_TessLevelOuter[0..2] lives at DWords 7-5 (reversed).
211           */
212          if (write) {
213             assert(intr->src[0].ssa->num_components == 4);
214 
215             unsigned wzyx[4] = { 3, 2, 1, 0 };
216             src = nir_swizzle(b, intr->src[0].ssa, wzyx, 4);
217             mask = !!(mask & WRITEMASK_X) << 3 | !!(mask & WRITEMASK_Y) << 2 |
218                    !!(mask & WRITEMASK_Z) << 1 | !!(mask & WRITEMASK_W) << 0;
219 
220             /* Don't overwrite the inner factor at DWord 4 for triangles */
221             if (_primitive_mode == TESS_PRIMITIVE_TRIANGLES)
222                mask &= ~WRITEMASK_X;
223          } else if (intr->def.num_components > 1) {
224             assert(intr->def.num_components == 4);
225 
226             unsigned wzyx[4] = { 3, 2, 1, 0 };
227             dest = nir_swizzle(b, &intr->def, wzyx, 4);
228          } else {
229             nir_intrinsic_set_component(intr, 3 - component);
230             out_of_bounds = component == 3 &&
231                             _primitive_mode == TESS_PRIMITIVE_TRIANGLES;
232          }
233          break;
234       case TESS_PRIMITIVE_ISOLINES:
235          /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */
236          if (write) {
237             assert(intr->src[0].ssa->num_components == 4);
238 
239             nir_def *undef = nir_undef(b, 1, 32);
240             nir_def *x = nir_channel(b, intr->src[0].ssa, 0);
241             nir_def *y = nir_channel(b, intr->src[0].ssa, 1);
242             src = nir_vec4(b, undef, undef, x, y);
243             mask = !!(mask & WRITEMASK_X) << 2 | !!(mask & WRITEMASK_Y) << 3;
244          } else {
245             nir_intrinsic_set_component(intr, 2 + component);
246             out_of_bounds = component > 1;
247          }
248          break;
249       default:
250          unreachable("Bogus tessellation domain");
251       }
252    } else {
253       return false;
254    }
255 
256    if (out_of_bounds) {
257       if (!write)
258          nir_def_rewrite_uses(&intr->def, nir_undef(b, 1, 32));
259       nir_instr_remove(&intr->instr);
260    } else if (write) {
261       nir_intrinsic_set_write_mask(intr, mask);
262 
263       if (src) {
264          nir_src_rewrite(&intr->src[0], src);
265       }
266    } else if (dest) {
267       nir_def_rewrite_uses_after(&intr->def, dest,
268                                      dest->parent_instr);
269    }
270 
271    return true;
272 }
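
/* For reference, the tessellation factor layout implied by the base/component
 * assignments above (patch URB slot N covers DWords 4*N..4*N+3):
 *
 *    Quads:     gl_TessLevelInner[0..1] -> DWords 3-2 (reversed)
 *               gl_TessLevelOuter[0..3] -> DWords 7-4 (reversed)
 *    Triangles: gl_TessLevelInner[0]    -> DWord  4
 *               gl_TessLevelOuter[0..2] -> DWords 7-5 (reversed)
 *    Isolines:  gl_TessLevelOuter[0..1] -> DWords 6-7 (in order),
 *               inner levels are treated as out of bounds
 */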
273 
274 static bool
275 is_input(nir_intrinsic_instr *intrin)
276 {
277    return intrin->intrinsic == nir_intrinsic_load_input ||
278           intrin->intrinsic == nir_intrinsic_load_per_primitive_input ||
279           intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
280           intrin->intrinsic == nir_intrinsic_load_interpolated_input;
281 }
282 
283 static bool
284 is_output(nir_intrinsic_instr *intrin)
285 {
286    return intrin->intrinsic == nir_intrinsic_load_output ||
287           intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
288           intrin->intrinsic == nir_intrinsic_load_per_view_output ||
289           intrin->intrinsic == nir_intrinsic_store_output ||
290           intrin->intrinsic == nir_intrinsic_store_per_vertex_output ||
291           intrin->intrinsic == nir_intrinsic_store_per_view_output;
292 }
293 
294 
295 static bool
296 remap_patch_urb_offsets(nir_block *block, nir_builder *b,
297                         const struct intel_vue_map *vue_map,
298                         enum tess_primitive_mode tes_primitive_mode)
299 {
300    nir_foreach_instr_safe(instr, block) {
301       if (instr->type != nir_instr_type_intrinsic)
302          continue;
303 
304       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
305 
306       gl_shader_stage stage = b->shader->info.stage;
307 
308       if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
309           (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
310 
311          if (remap_tess_levels(b, intrin, tes_primitive_mode))
312             continue;
313 
314          int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
315          assert(vue_slot != -1);
316          intrin->const_index[0] = vue_slot;
317 
318          nir_src *vertex = nir_get_io_arrayed_index_src(intrin);
319          if (vertex) {
320             if (nir_src_is_const(*vertex)) {
321                intrin->const_index[0] += nir_src_as_uint(*vertex) *
322                                          vue_map->num_per_vertex_slots;
323             } else {
324                b->cursor = nir_before_instr(&intrin->instr);
325 
326                /* Multiply by the number of per-vertex slots. */
327                nir_def *vertex_offset =
328                   nir_imul(b,
329                            vertex->ssa,
330                            nir_imm_int(b,
331                                        vue_map->num_per_vertex_slots));
332 
333                /* Add it to the existing offset */
334                nir_src *offset = nir_get_io_offset_src(intrin);
335                nir_def *total_offset =
336                   nir_iadd(b, vertex_offset,
337                            offset->ssa);
338 
339                nir_src_rewrite(offset, total_offset);
340             }
341          }
342       }
343    }
344    return true;
345 }
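
/* A sketch of the effect of the remapping above for a per-vertex access with
 * a non-constant vertex index (VARYING_SLOT_VAR3 is just an example varying):
 *
 *    load_per_vertex_output(vertex, offset, base = VARYING_SLOT_VAR3)
 *
 * gets its base rewritten to vue_map->varying_to_slot[VARYING_SLOT_VAR3] and
 * its indirect offset rewritten to
 *
 *    vertex * vue_map->num_per_vertex_slots + offset
 *
 * while a constant vertex index is simply folded into the base instead.
 */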
346 
347 /* Replace load/store_per_view_output with plain load/store_output, mapping
348  * the view index to an IO offset. Because we only use per-view outputs for
349  * position, the offset pitch is always 1. */
350 static bool
351 lower_per_view_outputs(nir_builder *b,
352                        nir_intrinsic_instr *intrin,
353                        UNUSED void *cb_data)
354 {
355    if (intrin->intrinsic != nir_intrinsic_store_per_view_output &&
356        intrin->intrinsic != nir_intrinsic_load_per_view_output)
357       return false;
358 
359    b->cursor = nir_before_instr(&intrin->instr);
360 
361    nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
362    nir_src *offset = nir_get_io_offset_src(intrin);
363 
364    nir_def *new_offset = nir_iadd(b, view_index->ssa, offset->ssa);
365 
366    nir_intrinsic_instr *new;
367    if (intrin->intrinsic == nir_intrinsic_store_per_view_output)
368       new = nir_store_output(b, intrin->src[0].ssa, new_offset);
369    else {
370       nir_def *new_def = nir_load_output(b, intrin->def.num_components,
371                                          intrin->def.bit_size, new_offset);
372       new = nir_instr_as_intrinsic(new_def->parent_instr);
373    }
374 
375    nir_intrinsic_set_base(new, nir_intrinsic_base(intrin));
376    nir_intrinsic_set_range(new, nir_intrinsic_range(intrin));
377    nir_intrinsic_set_write_mask(new, nir_intrinsic_write_mask(intrin));
378    nir_intrinsic_set_component(new, nir_intrinsic_component(intrin));
379    nir_intrinsic_set_src_type(new, nir_intrinsic_src_type(intrin));
380    nir_intrinsic_set_io_semantics(new, nir_intrinsic_io_semantics(intrin));
381 
382    if (intrin->intrinsic == nir_intrinsic_load_per_view_output)
383       nir_def_rewrite_uses(&intrin->def, &new->def);
384    nir_instr_remove(&intrin->instr);
385 
386    return true;
387 }
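
/* An illustrative before/after for the lowering above (position is the only
 * per-view output, hence the pitch of one slot per view):
 *
 *    store_per_view_output(value, view_index, offset, base = B)
 * becomes
 *    store_output(value, view_index + offset, base = B)
 *
 * with range, write mask, component, source type and IO semantics copied over
 * unchanged; loads are rewritten the same way.
 */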
388 
389 static bool
390 brw_nir_lower_per_view_outputs(nir_shader *nir)
391 {
392    return nir_shader_intrinsics_pass(nir, lower_per_view_outputs,
393                                      nir_metadata_control_flow,
394                                      NULL);
395 }
396 
397 void
398 brw_nir_lower_vs_inputs(nir_shader *nir)
399 {
400    /* Start with the location of the variable's base. */
401    nir_foreach_shader_in_variable(var, nir)
402       var->data.driver_location = var->data.location;
403 
404    /* Now use nir_lower_io to walk dereference chains.  Attribute arrays are
405     * loaded as one vec4 or dvec4 per element (or matrix column), depending on
406     * whether it is a double-precision type or not.
407     */
408    nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
409                 nir_lower_io_lower_64bit_to_32);
410 
411    /* This pass needs actual constants */
412    nir_opt_constant_folding(nir);
413 
414    nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
415 
416    /* The last step is to remap VERT_ATTRIB_* to actual registers */
417 
418    /* Whether or not we have any system generated values.  gl_DrawID is not
419     * included here as it lives in its own vec4.
420     */
421    const bool has_sgvs =
422       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
423       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
424       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
425       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
426 
427    const unsigned num_inputs = util_bitcount64(nir->info.inputs_read);
428 
429    nir_foreach_function_impl(impl, nir) {
430       nir_builder b = nir_builder_create(impl);
431 
432       nir_foreach_block(block, impl) {
433          nir_foreach_instr_safe(instr, block) {
434             if (instr->type != nir_instr_type_intrinsic)
435                continue;
436 
437             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
438 
439             switch (intrin->intrinsic) {
440             case nir_intrinsic_load_first_vertex:
441             case nir_intrinsic_load_base_instance:
442             case nir_intrinsic_load_vertex_id_zero_base:
443             case nir_intrinsic_load_instance_id:
444             case nir_intrinsic_load_is_indexed_draw:
445             case nir_intrinsic_load_draw_id: {
446                b.cursor = nir_after_instr(&intrin->instr);
447 
448                /* gl_VertexID and friends are stored by the VF as the last
449                 * vertex element.  We convert them to load_input intrinsics at
450                 * the right location.
451                 */
452                nir_intrinsic_instr *load =
453                   nir_intrinsic_instr_create(nir, nir_intrinsic_load_input);
454                load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
455 
456                nir_intrinsic_set_base(load, num_inputs);
457                switch (intrin->intrinsic) {
458                case nir_intrinsic_load_first_vertex:
459                   nir_intrinsic_set_component(load, 0);
460                   break;
461                case nir_intrinsic_load_base_instance:
462                   nir_intrinsic_set_component(load, 1);
463                   break;
464                case nir_intrinsic_load_vertex_id_zero_base:
465                   nir_intrinsic_set_component(load, 2);
466                   break;
467                case nir_intrinsic_load_instance_id:
468                   nir_intrinsic_set_component(load, 3);
469                   break;
470                case nir_intrinsic_load_draw_id:
471                case nir_intrinsic_load_is_indexed_draw:
472                   /* gl_DrawID and IsIndexedDraw are stored right after
473                    * gl_VertexID and friends if any of them exist.
474                    */
475                   nir_intrinsic_set_base(load, num_inputs + has_sgvs);
476                   if (intrin->intrinsic == nir_intrinsic_load_draw_id)
477                      nir_intrinsic_set_component(load, 0);
478                   else
479                      nir_intrinsic_set_component(load, 1);
480                   break;
481                default:
482                   unreachable("Invalid system value intrinsic");
483                }
484 
485                load->num_components = 1;
486                nir_def_init(&load->instr, &load->def, 1, 32);
487                nir_builder_instr_insert(&b, &load->instr);
488 
489                nir_def_replace(&intrin->def, &load->def);
490                break;
491             }
492 
493             case nir_intrinsic_load_input: {
494                /* Attributes come in a contiguous block, ordered by their
495                 * gl_vert_attrib value.  That means we can compute the slot
496                 * number for an attribute by masking out the enabled attributes
497                 * before it and counting the bits.
498                 */
499                int attr = nir_intrinsic_base(intrin);
500                int slot = util_bitcount64(nir->info.inputs_read &
501                                           BITFIELD64_MASK(attr));
502                nir_intrinsic_set_base(intrin, slot);
503                break;
504             }
505 
506             default:
507                break; /* Nothing to do */
508             }
509          }
510       }
511    }
512 }
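
/* A hypothetical attribute layout to illustrate the remapping above: if
 * inputs_read contains only VERT_ATTRIB_POS, VERT_ATTRIB_GENERIC2 and
 * VERT_ATTRIB_GENERIC5, then num_inputs == 3 and the final bases are:
 *
 *    VERT_ATTRIB_POS         -> slot 0
 *    VERT_ATTRIB_GENERIC2    -> slot 1
 *    VERT_ATTRIB_GENERIC5    -> slot 2
 *    gl_VertexID and friends -> slot 3 (the SGV element, if any are read)
 *    gl_DrawID/IsIndexedDraw -> slot 4 (slot 3 when there are no SGVs)
 */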
513 
514 void
515 brw_nir_lower_vue_inputs(nir_shader *nir,
516                          const struct intel_vue_map *vue_map)
517 {
518    nir_foreach_shader_in_variable(var, nir)
519       var->data.driver_location = var->data.location;
520 
521    /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
522    nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
523                 nir_lower_io_lower_64bit_to_32);
524 
525    /* This pass needs actual constants */
526    nir_opt_constant_folding(nir);
527 
528    nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
529 
530    nir_foreach_function_impl(impl, nir) {
531       nir_foreach_block(block, impl) {
532          nir_foreach_instr(instr, block) {
533             if (instr->type != nir_instr_type_intrinsic)
534                continue;
535 
536             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
537 
538             if (intrin->intrinsic == nir_intrinsic_load_input ||
539                 intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
540                /* Offset 0 is the VUE header, which contains
541                 * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
542                 * VARYING_SLOT_PSIZ [.w].
543                 */
544                int varying = nir_intrinsic_base(intrin);
545                int vue_slot;
546                switch (varying) {
547                case VARYING_SLOT_PSIZ:
548                   nir_intrinsic_set_base(intrin, 0);
549                   nir_intrinsic_set_component(intrin, 3);
550                   break;
551 
552                default:
553                   vue_slot = vue_map->varying_to_slot[varying];
554                   assert(vue_slot != -1);
555                   nir_intrinsic_set_base(intrin, vue_slot);
556                   break;
557                }
558             }
559          }
560       }
561    }
562 }
563 
564 void
565 brw_nir_lower_tes_inputs(nir_shader *nir, const struct intel_vue_map *vue_map)
566 {
567    nir_foreach_shader_in_variable(var, nir)
568       var->data.driver_location = var->data.location;
569 
570    nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
571                 nir_lower_io_lower_64bit_to_32);
572 
573    /* This pass needs actual constants */
574    nir_opt_constant_folding(nir);
575 
576    nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
577 
578    nir_foreach_function_impl(impl, nir) {
579       nir_builder b = nir_builder_create(impl);
580       nir_foreach_block(block, impl) {
581          remap_patch_urb_offsets(block, &b, vue_map,
582                                  nir->info.tess._primitive_mode);
583       }
584    }
585 }
586 
587 static bool
588 lower_barycentric_per_sample(nir_builder *b,
589                              nir_intrinsic_instr *intrin,
590                              UNUSED void *cb_data)
591 {
592    if (intrin->intrinsic != nir_intrinsic_load_barycentric_pixel &&
593        intrin->intrinsic != nir_intrinsic_load_barycentric_centroid)
594       return false;
595 
596    b->cursor = nir_before_instr(&intrin->instr);
597    nir_def *centroid =
598       nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
599                            nir_intrinsic_interp_mode(intrin));
600    nir_def_replace(&intrin->def, centroid);
601    return true;
602 }
603 
604 /**
605  * Convert interpolateAtOffset() offsets from [-0.5, +0.5] floating point
606  * offsets to integer [-8, +7] offsets (in units of 1/16th of a pixel).
607  *
608  * We clamp to +7/16 on the upper end of the range, since +0.5 isn't
609  * representable in a S0.4 value; a naive conversion would give us -8/16,
610  * which is the opposite of what was intended.
611  *
612  * This is allowed by GL_ARB_gpu_shader5's quantization rules:
613  *
614  *    "Not all values of <offset> may be supported; x and y offsets may
615  *     be rounded to fixed-point values with the number of fraction bits
616  *     given by the implementation-dependent constant
617  *     FRAGMENT_INTERPOLATION_OFFSET_BITS."
618  */
619 static bool
620 lower_barycentric_at_offset(nir_builder *b, nir_intrinsic_instr *intrin,
621                             void *data)
622 {
623    if (intrin->intrinsic != nir_intrinsic_load_barycentric_at_offset)
624       return false;
625 
626    b->cursor = nir_before_instr(&intrin->instr);
627 
628    assert(intrin->src[0].ssa);
629    nir_def *offset =
630       nir_imin(b, nir_imm_int(b, 7),
631                nir_f2i32(b, nir_fmul_imm(b, intrin->src[0].ssa, 16)));
632 
633    nir_src_rewrite(&intrin->src[0], offset);
634 
635    return true;
636 }
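
/* Worked examples of the conversion above, in 1/16ths of a pixel:
 *
 *    offset  0.4375 -> f2i32( 0.4375 * 16) =  7 -> imin(7,  7) =  7  (+7/16)
 *    offset  0.5    -> f2i32( 0.5    * 16) =  8 -> imin(7,  8) =  7  (clamped)
 *    offset -0.5    -> f2i32(-0.5    * 16) = -8 -> imin(7, -8) = -8  (-8/16)
 */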
637 
638 void
639 brw_nir_lower_fs_inputs(nir_shader *nir,
640                         const struct intel_device_info *devinfo,
641                         const struct brw_wm_prog_key *key)
642 {
643    nir_foreach_shader_in_variable(var, nir) {
644       var->data.driver_location = var->data.location;
645 
646       /* Apply default interpolation mode.
647        *
648        * Everything defaults to smooth except for the legacy GL color
649        * built-in variables, which might be flat depending on API state.
650        */
651       if (var->data.interpolation == INTERP_MODE_NONE) {
652          const bool flat = key->flat_shade &&
653             (var->data.location == VARYING_SLOT_COL0 ||
654              var->data.location == VARYING_SLOT_COL1);
655 
656          var->data.interpolation = flat ? INTERP_MODE_FLAT
657                                         : INTERP_MODE_SMOOTH;
658       }
659    }
660 
661    nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
662                 nir_lower_io_lower_64bit_to_32 |
663                 nir_lower_io_use_interpolated_input_intrinsics);
664    if (devinfo->ver >= 11)
665       nir_lower_interpolation(nir, ~0);
666 
667    if (key->multisample_fbo == INTEL_NEVER) {
668       nir_lower_single_sampled(nir);
669    } else if (key->persample_interp == INTEL_ALWAYS) {
670       nir_shader_intrinsics_pass(nir, lower_barycentric_per_sample,
671                                    nir_metadata_control_flow,
672                                    NULL);
673    }
674 
675    if (devinfo->ver < 20)
676       nir_shader_intrinsics_pass(nir, lower_barycentric_at_offset,
677                                  nir_metadata_control_flow,
678                                  NULL);
679 
680    /* This pass needs actual constants */
681    nir_opt_constant_folding(nir);
682 
683    nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
684 }
685 
686 void
687 brw_nir_lower_vue_outputs(nir_shader *nir)
688 {
689    nir_foreach_shader_out_variable(var, nir) {
690       var->data.driver_location = var->data.location;
691    }
692 
693    nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
694                 nir_lower_io_lower_64bit_to_32);
695    brw_nir_lower_per_view_outputs(nir);
696 }
697 
698 void
699 brw_nir_lower_tcs_outputs(nir_shader *nir, const struct intel_vue_map *vue_map,
700                           enum tess_primitive_mode tes_primitive_mode)
701 {
702    nir_foreach_shader_out_variable(var, nir) {
703       var->data.driver_location = var->data.location;
704    }
705 
706    nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
707                 nir_lower_io_lower_64bit_to_32);
708 
709    /* This pass needs actual constants */
710    nir_opt_constant_folding(nir);
711 
712    nir_io_add_const_offset_to_base(nir, nir_var_shader_out);
713 
714    nir_foreach_function_impl(impl, nir) {
715       nir_builder b = nir_builder_create(impl);
716       nir_foreach_block(block, impl) {
717          remap_patch_urb_offsets(block, &b, vue_map, tes_primitive_mode);
718       }
719    }
720 }
721 
722 void
723 brw_nir_lower_fs_outputs(nir_shader *nir)
724 {
725    nir_foreach_shader_out_variable(var, nir) {
726       var->data.driver_location =
727          SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) |
728          SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION);
729    }
730 
731    nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0);
732 }
733 
734 static bool
735 tag_speculative_access(nir_builder *b,
736                        nir_intrinsic_instr *intrin,
737                        void *unused)
738 {
739    if (intrin->intrinsic == nir_intrinsic_load_ubo &&
740        brw_nir_ubo_surface_index_is_pushable(intrin->src[0])) {
741       nir_intrinsic_set_access(intrin, ACCESS_CAN_SPECULATE |
742                                nir_intrinsic_access(intrin));
743       return true;
744    }
745 
746    return false;
747 }
748 
749 static bool
750 brw_nir_tag_speculative_access(nir_shader *nir)
751 {
752    return nir_shader_intrinsics_pass(nir, tag_speculative_access,
753                                      nir_metadata_all, NULL);
754 }
755 
756 #define OPT(pass, ...) ({                                  \
757    bool this_progress = false;                             \
758    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
759    if (this_progress)                                      \
760       progress = true;                                     \
761    this_progress;                                          \
762 })
763 
764 #define LOOP_OPT(pass, ...) ({                             \
765    const unsigned long this_line = __LINE__;               \
766    bool this_progress = false;                             \
767    if (opt_line == this_line)                              \
768       break;                                               \
769    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
770    if (this_progress) {                                    \
771       progress = true;                                     \
772       opt_line = this_line;                                \
773    }                                                       \
774    this_progress;                                          \
775 })
776 
777 #define LOOP_OPT_NOT_IDEMPOTENT(pass, ...) ({              \
778    bool this_progress = false;                             \
779    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
780    if (this_progress) {                                    \
781       progress = true;                                     \
782       opt_line = 0;                                        \
783    }                                                       \
784    this_progress;                                          \
785 })
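
/* How the LOOP_OPT machinery terminates: opt_line records the __LINE__ of the
 * most recent pass that made progress.  When the do/while loop in
 * brw_nir_optimize() comes back around to that same LOOP_OPT call with
 * opt_line still unchanged, every other pass in a full cycle has failed to
 * make progress, so the "break" exits the loop early.  LOOP_OPT_NOT_IDEMPOTENT
 * instead resets opt_line to 0 on progress, because passes such as
 * nir_opt_algebraic can keep making progress on their own output and must not
 * be used as the early-exit point.
 */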
786 
787 void
788 brw_nir_optimize(nir_shader *nir,
789                  const struct intel_device_info *devinfo)
790 {
791    bool progress;
792    unsigned lower_flrp =
793       (nir->options->lower_flrp16 ? 16 : 0) |
794       (nir->options->lower_flrp32 ? 32 : 0) |
795       (nir->options->lower_flrp64 ? 64 : 0);
796 
797    unsigned long opt_line = 0;
798    do {
799       progress = false;
800       /* This pass is causing problems with types used by OpenCL :
801        *    https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13955
802        *
803        * Running with it disabled made no difference in the resulting assembly
804        * code.
805        */
806       if (nir->info.stage != MESA_SHADER_KERNEL)
807          LOOP_OPT(nir_split_array_vars, nir_var_function_temp);
808       LOOP_OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
809       LOOP_OPT(nir_opt_deref);
810       if (LOOP_OPT(nir_opt_memcpy))
811          LOOP_OPT(nir_split_var_copies);
812       LOOP_OPT(nir_lower_vars_to_ssa);
813       if (!nir->info.var_copies_lowered) {
814          /* Only run this pass if nir_lower_var_copies was not called
815           * yet. That would lower away any copy_deref instructions and we
816           * don't want to introduce any more.
817           */
818          LOOP_OPT(nir_opt_find_array_copies);
819       }
820       LOOP_OPT(nir_opt_copy_prop_vars);
821       LOOP_OPT(nir_opt_dead_write_vars);
822       LOOP_OPT(nir_opt_combine_stores, nir_var_all);
823 
824       LOOP_OPT(nir_opt_ray_queries);
825       LOOP_OPT(nir_opt_ray_query_ranges);
826 
827       LOOP_OPT(nir_lower_alu_to_scalar, NULL, NULL);
828 
829       LOOP_OPT(nir_copy_prop);
830 
831       LOOP_OPT(nir_lower_phis_to_scalar, false);
832 
833       LOOP_OPT(nir_copy_prop);
834       LOOP_OPT(nir_opt_dce);
835       LOOP_OPT(nir_opt_cse);
836       LOOP_OPT(nir_opt_combine_stores, nir_var_all);
837 
838       /* Passing 0 to the peephole select pass causes it to convert
839        * if-statements that contain only move instructions in the branches
840        * regardless of the count.
841        *
842        * Passing 1 to the peephole select pass causes it to convert
843        * if-statements that contain at most a single ALU instruction (total)
844        * in both branches.  Before Gfx6, some math instructions were
845        * prohibitively expensive and the results of compare operations need an
846        * extra resolve step.  For these reasons, this pass is more harmful
847        * than good on those platforms.
848        *
849        * For indirect loads of uniforms (push constants), we assume that array
850        * indices will nearly always be in bounds and the cost of the load is
851        * low.  Therefore there shouldn't be a performance benefit to avoiding it.
852        */
853       LOOP_OPT(nir_opt_peephole_select, 0, true, false);
854       LOOP_OPT(nir_opt_peephole_select, 8, true, true);
855 
856       LOOP_OPT(nir_opt_intrinsics);
857       LOOP_OPT(nir_opt_idiv_const, 32);
858       LOOP_OPT_NOT_IDEMPOTENT(nir_opt_algebraic);
859 
860       LOOP_OPT(nir_opt_generate_bfi);
861       LOOP_OPT(nir_opt_reassociate_bfi);
862 
863       LOOP_OPT(nir_lower_constant_convert_alu_types);
864       LOOP_OPT(nir_opt_constant_folding);
865 
866       if (lower_flrp != 0) {
867          if (LOOP_OPT(nir_lower_flrp,
868                  lower_flrp,
869                  false /* always_precise */)) {
870             LOOP_OPT(nir_opt_constant_folding);
871          }
872 
873          /* Nothing should rematerialize any flrps, so we only need to do this
874           * lowering once.
875           */
876          lower_flrp = 0;
877       }
878 
879       LOOP_OPT(nir_opt_dead_cf);
880       if (LOOP_OPT(nir_opt_loop)) {
881          /* If nir_opt_loop makes progress, then we need to clean
882           * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
883           * to make progress.
884           */
885          LOOP_OPT(nir_copy_prop);
886          LOOP_OPT(nir_opt_dce);
887       }
888       LOOP_OPT_NOT_IDEMPOTENT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
889       LOOP_OPT(nir_opt_conditional_discard);
890       if (nir->options->max_unroll_iterations != 0) {
891          LOOP_OPT_NOT_IDEMPOTENT(nir_opt_loop_unroll);
892       }
893       LOOP_OPT(nir_opt_remove_phis);
894       LOOP_OPT(nir_opt_gcm, false);
895       LOOP_OPT(nir_opt_undef);
896       LOOP_OPT(nir_lower_pack);
897    } while (progress);
898 
899    /* Work around a GfxBench unused local sampler variable which would trigger
900     * an assert in the opt_large_constants pass.
901     */
902    OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
903 }
904 
905 static unsigned
906 lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
907 {
908    switch (instr->type) {
909    case nir_instr_type_alu: {
910       nir_alu_instr *alu = nir_instr_as_alu(instr);
911       switch (alu->op) {
912       case nir_op_bit_count:
913       case nir_op_ufind_msb:
914       case nir_op_ifind_msb:
915       case nir_op_find_lsb:
916          /* These are handled specially because the destination is always
917           * 32-bit and so the bit size of the instruction is given by the
918           * source.
919           */
920          return alu->src[0].src.ssa->bit_size >= 32 ? 0 : 32;
921       default:
922          break;
923       }
924 
925       if (alu->def.bit_size >= 32)
926          return 0;
927 
928       /* Note: nir_op_iabs and nir_op_ineg are not lowered here because the
929        * 8-bit ABS or NEG instruction should eventually get copy propagated
930        * into the MOV that does the type conversion.  This results in far
931        * fewer MOV instructions.
932        */
933       switch (alu->op) {
934       case nir_op_idiv:
935       case nir_op_imod:
936       case nir_op_irem:
937       case nir_op_udiv:
938       case nir_op_umod:
939       case nir_op_fceil:
940       case nir_op_ffloor:
941       case nir_op_ffract:
942       case nir_op_fround_even:
943       case nir_op_ftrunc:
944          return 32;
945       case nir_op_frcp:
946       case nir_op_frsq:
947       case nir_op_fsqrt:
948       case nir_op_fpow:
949       case nir_op_fexp2:
950       case nir_op_flog2:
951       case nir_op_fsin:
952       case nir_op_fcos:
953          return 0;
954       case nir_op_isign:
955          assert(!"Should have been lowered by nir_opt_algebraic.");
956          return 0;
957       default:
958          if (nir_op_infos[alu->op].num_inputs >= 2 &&
959              alu->def.bit_size == 8)
960             return 16;
961 
962          if (nir_alu_instr_is_comparison(alu) &&
963              alu->src[0].src.ssa->bit_size == 8)
964             return 16;
965 
966          return 0;
967       }
968       break;
969    }
970 
971    case nir_instr_type_intrinsic: {
972       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
973       switch (intrin->intrinsic) {
974       case nir_intrinsic_read_invocation:
975       case nir_intrinsic_read_first_invocation:
976       case nir_intrinsic_vote_feq:
977       case nir_intrinsic_vote_ieq:
978       case nir_intrinsic_shuffle:
979       case nir_intrinsic_shuffle_xor:
980       case nir_intrinsic_shuffle_up:
981       case nir_intrinsic_shuffle_down:
982       case nir_intrinsic_quad_broadcast:
983       case nir_intrinsic_quad_swap_horizontal:
984       case nir_intrinsic_quad_swap_vertical:
985       case nir_intrinsic_quad_swap_diagonal:
986          if (intrin->src[0].ssa->bit_size == 8)
987             return 16;
988          return 0;
989 
990       case nir_intrinsic_reduce:
991       case nir_intrinsic_inclusive_scan:
992       case nir_intrinsic_exclusive_scan:
993          /* There are a couple of register region issues that make things
994           * complicated for 8-bit types:
995           *
996           *    1. Only raw moves are allowed to write to a packed 8-bit
997           *       destination.
998           *    2. If we use a strided destination, the efficient way to do
999           *       scan operations ends up using strides that are too big to
1000           *       encode in an instruction.
1001           *
1002           * To get around these issues, we just do all 8-bit scan operations
1003           * in 16 bits.  It's actually fewer instructions than what we'd have
1004           * to do if we were trying to do it in native 8-bit types and the
1005           * results are the same once we truncate to 8 bits at the end.
1006           */
1007          if (intrin->def.bit_size == 8)
1008             return 16;
1009          return 0;
1010 
1011       default:
1012          return 0;
1013       }
1014       break;
1015    }
1016 
1017    case nir_instr_type_phi: {
1018       nir_phi_instr *phi = nir_instr_as_phi(instr);
1019       if (phi->def.bit_size == 8)
1020          return 16;
1021       return 0;
1022    }
1023 
1024    default:
1025       return 0;
1026    }
1027 }
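
/* Returning a non-zero bit size from the callback above asks
 * nir_lower_bit_size() to perform the operation at that width and convert the
 * result back.  Roughly (a sketch, not verbatim NIR), an 8-bit add that hits
 * the "two inputs, 8-bit destination" rule ends up as:
 *
 *    i2i8(iadd(i2i16(a), i2i16(b)))
 */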
1028 
1029 /* On gfx12.5+, if the offsets are not both constant and in the [-8, 7] range,
1030  * we will have nir_lower_tex() lower the source offset by returning true from
1031  * this filter function.
1032  */
1033 static bool
1034 lower_xehp_tg4_offset_filter(const nir_instr *instr, UNUSED const void *data)
1035 {
1036    if (instr->type != nir_instr_type_tex)
1037       return false;
1038 
1039    nir_tex_instr *tex = nir_instr_as_tex(instr);
1040 
1041    if (tex->op != nir_texop_tg4)
1042       return false;
1043 
1044    int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset);
1045    if (offset_index < 0)
1046       return false;
1047 
1048    if (!nir_src_is_const(tex->src[offset_index].src))
1049       return true;
1050 
1051    int64_t offset_x = nir_src_comp_as_int(tex->src[offset_index].src, 0);
1052    int64_t offset_y = nir_src_comp_as_int(tex->src[offset_index].src, 1);
1053 
1054    return offset_x < -8 || offset_x > 7 || offset_y < -8 || offset_y > 7;
1055 }
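
/* Examples of what the filter above selects for lowering (illustrative GLSL;
 * "s", "p" and "dynamic_ofs" are placeholder operands):
 *
 *    textureGatherOffset(s, p, ivec2(4, -3))  -> false (HW handles it)
 *    textureGatherOffset(s, p, ivec2(9,  0))  -> true  (outside [-8, 7])
 *    textureGatherOffset(s, p, dynamic_ofs)   -> true  (not constant)
 *
 * Returning true makes nir_lower_tex() lower the offset source, per the
 * comment above.
 */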
1056 
1057 /* Does some simple lowering and runs the standard suite of optimizations
1058  *
1059  * This is intended to be called more-or-less directly after you get the
1060  * shader out of GLSL or some other source.  While it is geared towards i965,
1061  * it is not at all generator-specific.
1062  */
1063 void
1064 brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
1065                    const struct brw_nir_compiler_opts *opts)
1066 {
1067    const struct intel_device_info *devinfo = compiler->devinfo;
1068    UNUSED bool progress; /* Written by OPT */
1069 
1070    nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
1071 
1072    OPT(nir_lower_frexp);
1073 
1074    OPT(nir_lower_alu_to_scalar, NULL, NULL);
1075 
1076    if (nir->info.stage == MESA_SHADER_GEOMETRY)
1077       OPT(nir_lower_gs_intrinsics, 0);
1078 
1079    /* See also brw_nir_trig_workarounds.py */
1080    if (compiler->precise_trig &&
1081        !(devinfo->ver >= 10 || devinfo->platform == INTEL_PLATFORM_KBL))
1082       OPT(brw_nir_apply_trig_workarounds);
1083 
1084    /* This workaround exists for performance reasons. Since it requires not
1085     * setting RENDER_SURFACE_STATE::SurfaceArray when the array length is 1,
1086     * we're losing the HW robustness feature in that case.
1087     *
1088     * So when robust image access is enabled, just avoid the workaround.
1089     */
1090    if (intel_needs_workaround(devinfo, 1806565034) && !opts->robust_image_access)
1091       OPT(intel_nir_clamp_image_1d_2d_array_sizes);
1092 
1093    const struct intel_nir_lower_texture_opts intel_tex_options = {
1094       .combined_lod_or_bias_and_offset = compiler->devinfo->ver >= 20,
1095    };
1096    OPT(intel_nir_lower_texture, &intel_tex_options);
1097 
1098    const nir_lower_tex_options tex_options = {
1099       .lower_txp = ~0,
1100       .lower_txf_offset = true,
1101       .lower_rect_offset = true,
1102       .lower_txd_cube_map = true,
1103       /* For below, see bspec 45942, "Enable new message layout for cube array" */
1104       .lower_txd_3d = devinfo->verx10 >= 125,
1105       .lower_txd_array = devinfo->verx10 >= 125,
1106       .lower_txb_shadow_clamp = true,
1107       .lower_txd_shadow_clamp = true,
1108       .lower_txd_offset_clamp = true,
1109       .lower_tg4_offsets = true,
1110       .lower_txs_lod = true, /* Wa_14012320009 */
1111       .lower_offset_filter =
1112          devinfo->verx10 >= 125 ? lower_xehp_tg4_offset_filter : NULL,
1113       .lower_invalid_implicit_lod = true,
1114    };
1115 
1116    /* In the case where TG4 coords are lowered to offsets and we have a
1117     * lower_xehp_tg4_offset_filter lowering those offsets further, we need to
1118     * rerun the pass because the instructions inserted by the first lowering
1119     * are not visible during that first pass.
1120     */
1121    if (OPT(nir_lower_tex, &tex_options)) {
1122       OPT(intel_nir_lower_texture, &intel_tex_options);
1123       OPT(nir_lower_tex, &tex_options);
1124    }
1125 
1126    OPT(nir_normalize_cubemap_coords);
1127 
1128    OPT(nir_lower_global_vars_to_local);
1129 
1130    OPT(nir_split_var_copies);
1131    OPT(nir_split_struct_vars, nir_var_function_temp);
1132 
1133    brw_nir_optimize(nir, devinfo);
1134 
1135    struct nir_opt_16bit_tex_image_options options = {
1136       .rounding_mode = nir_rounding_mode_undef,
1137       .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
1138    };
1139    OPT(nir_opt_16bit_tex_image, &options);
1140 
1141    OPT(nir_lower_doubles, opts->softfp64, nir->options->lower_doubles_options);
1142    if (OPT(nir_lower_int64_float_conversions)) {
1143       OPT(nir_opt_algebraic);
1144       OPT(nir_lower_doubles, opts->softfp64,
1145           nir->options->lower_doubles_options);
1146    }
1147 
1148    OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
1149 
1150    /* Lower a bunch of stuff */
1151    OPT(nir_lower_var_copies);
1152 
1153    /* This needs to be run after the first optimization pass but before we
1154     * lower indirect derefs away
1155     */
1156    OPT(nir_opt_large_constants, NULL, 32);
1157 
1158    OPT(nir_lower_load_const_to_scalar);
1159 
1160    OPT(nir_lower_system_values);
1161    nir_lower_compute_system_values_options lower_csv_options = {
1162       .has_base_workgroup_id = nir->info.stage == MESA_SHADER_COMPUTE,
1163    };
1164    OPT(nir_lower_compute_system_values, &lower_csv_options);
1165 
1166    const nir_lower_subgroups_options subgroups_options = {
1167       .ballot_bit_size = 32,
1168       .ballot_components = 1,
1169       .lower_to_scalar = true,
1170       .lower_relative_shuffle = true,
1171       .lower_quad_broadcast_dynamic = true,
1172       .lower_elect = true,
1173       .lower_inverse_ballot = true,
1174       .lower_rotate_to_shuffle = true,
1175    };
1176    OPT(nir_lower_subgroups, &subgroups_options);
1177 
1178    nir_variable_mode indirect_mask =
1179       brw_nir_no_indirect_mask(compiler, nir->info.stage);
1180    OPT(nir_lower_indirect_derefs, indirect_mask, UINT32_MAX);
1181 
1182    /* Even in cases where we can handle indirect temporaries via scratch,
1183     * it can still be expensive.  Lower indirects on small arrays to
1184     * conditional load/stores.
1185     *
1186     * The threshold of 16 was chosen semi-arbitrarily.  The idea is that an
1187     * indirect on an array of 16 elements is about 30 instructions at which
1188     * point, you may be better off doing a send.  With a SIMD8 program, 16
1189     * floats is 1/8 of the entire register file.  Any array larger than that
1190     * is likely to cause pressure issues.  Also, this value is sufficiently
1191     * high that the benchmarks known to suffer from large temporary array
1192     * issues are helped but nothing else in shader-db is hurt except for maybe
1193     * that one kerbal space program shader.
1194     */
1195    if (!(indirect_mask & nir_var_function_temp))
1196       OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
1197 
1198    /* Lower array derefs of vectors for SSBO and UBO loads.  For both UBOs and
1199     * SSBOs, our back-end is capable of loading an entire vec4 at a time and
1200     * we would like to take advantage of that whenever possible regardless of
1201     * whether or not the app gives us full loads.  This should allow the
1202     * optimizer to combine UBO and SSBO load operations and save us some send
1203     * messages.
1204     */
1205    OPT(nir_lower_array_deref_of_vec,
1206        nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
1207        nir_lower_direct_array_deref_of_vec_load);
1208 
1209    /* Clamp load_per_vertex_input of the TCS stage so that we do not generate
1210     * loads reading out of bounds. We can do this here because we called
1211     * nir_lower_system_values above.
1212     */
1213    if (nir->info.stage == MESA_SHADER_TESS_CTRL &&
1214        compiler->use_tcs_multi_patch)
1215       OPT(intel_nir_clamp_per_vertex_loads);
1216 
1217    /* Get rid of split copies */
1218    brw_nir_optimize(nir, devinfo);
1219 }
1220 
1221 static bool
1222 brw_nir_zero_inputs_instr(struct nir_builder *b, nir_intrinsic_instr *intrin,
1223                           void *data)
1224 {
1225    if (intrin->intrinsic != nir_intrinsic_load_deref)
1226       return false;
1227 
1228    nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
1229    if (!nir_deref_mode_is(deref, nir_var_shader_in))
1230       return false;
1231 
1232    if (deref->deref_type != nir_deref_type_var)
1233       return false;
1234 
1235    nir_variable *var = deref->var;
1236 
1237    uint64_t zero_inputs = *(uint64_t *)data;
1238    if (!(BITFIELD64_BIT(var->data.location) & zero_inputs))
1239       return false;
1240 
1241    b->cursor = nir_before_instr(&intrin->instr);
1242 
1243    nir_def *zero = nir_imm_zero(b, 1, 32);
1244 
1245    nir_def_replace(&intrin->def, zero);
1246 
1247    return true;
1248 }
1249 
1250 static bool
1251 brw_nir_zero_inputs(nir_shader *shader, uint64_t *zero_inputs)
1252 {
1253    return nir_shader_intrinsics_pass(shader, brw_nir_zero_inputs_instr,
1254                                      nir_metadata_control_flow,
1255                                      zero_inputs);
1256 }
1257 
1258 /* Code for Wa_18019110168 may have created input/output variables beyond
1259  * VARYING_SLOT_MAX and removed uses of variables below VARYING_SLOT_MAX.
1260  * Clean it up, so they all stay below VARYING_SLOT_MAX.
1261  */
1262 static void
1263 brw_mesh_compact_io(nir_shader *mesh, nir_shader *frag)
1264 {
1265    gl_varying_slot mapping[VARYING_SLOT_MAX] = {0, };
1266    gl_varying_slot cur = VARYING_SLOT_VAR0;
1267    bool compact = false;
1268 
1269    nir_foreach_shader_out_variable(var, mesh) {
1270       gl_varying_slot location = var->data.location;
1271       if (location < VARYING_SLOT_VAR0)
1272          continue;
1273       assert(location < ARRAY_SIZE(mapping));
1274 
1275       const struct glsl_type *type = var->type;
1276       if (nir_is_arrayed_io(var, MESA_SHADER_MESH)) {
1277          assert(glsl_type_is_array(type));
1278          type = glsl_get_array_element(type);
1279       }
1280 
1281       if (mapping[location])
1282          continue;
1283 
1284       unsigned num_slots = glsl_count_attribute_slots(type, false);
1285 
1286       compact |= location + num_slots > VARYING_SLOT_MAX;
1287 
1288       mapping[location] = cur;
1289       cur += num_slots;
1290    }
1291 
1292    if (!compact)
1293       return;
1294 
1295    /* The rest of this function should be hit only for Wa_18019110168. */
1296 
1297    nir_foreach_shader_out_variable(var, mesh) {
1298       gl_varying_slot location = var->data.location;
1299       if (location < VARYING_SLOT_VAR0)
1300          continue;
1301       location = mapping[location];
1302       if (location == 0)
1303          continue;
1304       var->data.location = location;
1305    }
1306 
1307    nir_foreach_shader_in_variable(var, frag) {
1308       gl_varying_slot location = var->data.location;
1309       if (location < VARYING_SLOT_VAR0)
1310          continue;
1311       location = mapping[location];
1312       if (location == 0)
1313          continue;
1314       var->data.location = location;
1315    }
1316 
1317    nir_shader_gather_info(mesh, nir_shader_get_entrypoint(mesh));
1318    nir_shader_gather_info(frag, nir_shader_get_entrypoint(frag));
1319 
1320    if (should_print_nir(mesh)) {
1321       printf("%s\n", __func__);
1322       nir_print_shader(mesh, stdout);
1323    }
1324    if (should_print_nir(frag)) {
1325       printf("%s\n", __func__);
1326       nir_print_shader(frag, stdout);
1327    }
1328 }
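
/* In effect, the compaction above renumbers the generic (VAR*) mesh outputs
 * densely from VARYING_SLOT_VAR0 upward, counting attribute slots per
 * variable, and then applies the same mapping to the fragment shader inputs
 * so both stages keep matching locations.  It only kicks in when a variable
 * created for Wa_18019110168 would otherwise land past VARYING_SLOT_MAX.
 */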
1329 
1330 void
1331 brw_nir_link_shaders(const struct brw_compiler *compiler,
1332                      nir_shader *producer, nir_shader *consumer)
1333 {
1334    const struct intel_device_info *devinfo = compiler->devinfo;
1335 
1336    if (producer->info.stage == MESA_SHADER_MESH &&
1337        consumer->info.stage == MESA_SHADER_FRAGMENT) {
1338       uint64_t fs_inputs = 0, ms_outputs = 0;
1339       /* gl_MeshPerPrimitiveEXT[].gl_ViewportIndex, gl_PrimitiveID and gl_Layer
1340        * are per primitive, but the fragment shader does not have them marked as
1341        * such. Add the annotation here.
1342        */
1343       nir_foreach_shader_in_variable(var, consumer) {
1344          fs_inputs |= BITFIELD64_BIT(var->data.location);
1345 
1346          switch (var->data.location) {
1347             case VARYING_SLOT_LAYER:
1348             case VARYING_SLOT_PRIMITIVE_ID:
1349             case VARYING_SLOT_VIEWPORT:
1350                var->data.per_primitive = 1;
1351                break;
1352             default:
1353                continue;
1354          }
1355       }
1356 
1357       nir_foreach_shader_out_variable(var, producer)
1358          ms_outputs |= BITFIELD64_BIT(var->data.location);
1359 
1360       uint64_t zero_inputs = ~ms_outputs & fs_inputs;
1361       zero_inputs &= BITFIELD64_BIT(VARYING_SLOT_LAYER) |
1362                      BITFIELD64_BIT(VARYING_SLOT_VIEWPORT);
1363 
1364       if (zero_inputs)
1365          NIR_PASS(_, consumer, brw_nir_zero_inputs, &zero_inputs);
1366    }
1367 
1368    nir_lower_io_arrays_to_elements(producer, consumer);
1369    nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
1370    nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
1371 
1372    NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
1373    NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
1374    brw_nir_optimize(producer, devinfo);
1375    brw_nir_optimize(consumer, devinfo);
1376 
1377    if (nir_link_opt_varyings(producer, consumer))
1378       brw_nir_optimize(consumer, devinfo);
1379 
1380    NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
1381    NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
1382 
1383    if (nir_remove_unused_varyings(producer, consumer)) {
1384       if (should_print_nir(producer)) {
1385          printf("nir_remove_unused_varyings\n");
1386          nir_print_shader(producer, stdout);
1387       }
1388       if (should_print_nir(consumer)) {
1389          printf("nir_remove_unused_varyings\n");
1390          nir_print_shader(consumer, stdout);
1391       }
1392 
1393       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
1394       NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
1395 
1396       /* The backend might not be able to handle indirects on
1397        * temporaries so we need to lower indirects on any of the
1398        * varyings we have demoted here.
1399        */
1400       NIR_PASS(_, producer, nir_lower_indirect_derefs,
1401                   brw_nir_no_indirect_mask(compiler, producer->info.stage),
1402                   UINT32_MAX);
1403       NIR_PASS(_, consumer, nir_lower_indirect_derefs,
1404                   brw_nir_no_indirect_mask(compiler, consumer->info.stage),
1405                   UINT32_MAX);
1406 
1407       brw_nir_optimize(producer, devinfo);
1408       brw_nir_optimize(consumer, devinfo);
1409 
1410       if (producer->info.stage == MESA_SHADER_MESH &&
1411             consumer->info.stage == MESA_SHADER_FRAGMENT) {
1412          brw_mesh_compact_io(producer, consumer);
1413       }
1414    }
1415 
1416    NIR_PASS(_, producer, nir_lower_io_to_vector, nir_var_shader_out);
1417 
1418    if (producer->info.stage == MESA_SHADER_TESS_CTRL &&
1419        producer->options->vectorize_tess_levels)
1420       NIR_PASS_V(producer, nir_vectorize_tess_levels);
1421 
1422    NIR_PASS(_, producer, nir_opt_combine_stores, nir_var_shader_out);
1423    NIR_PASS(_, consumer, nir_lower_io_to_vector, nir_var_shader_in);
1424 
1425    if (producer->info.stage != MESA_SHADER_TESS_CTRL &&
1426        producer->info.stage != MESA_SHADER_MESH &&
1427        producer->info.stage != MESA_SHADER_TASK) {
1428       /* Calling lower_io_to_vector creates output variable writes with
1429        * write-masks.  On non-TCS outputs, the back-end can't handle it and we
1430        * need to call nir_lower_io_to_temporaries to get rid of them.  This,
1431        * in turn, creates temporary variables and extra copy_deref intrinsics
1432        * that we need to clean up.
1433        *
1434        * Note that Mesh/Task don't support I/O as temporaries (I/O is shared
1435        * across the whole workgroup, possibly using multiple HW threads). For
1436        * those stages, write-masks on outputs are handled by I/O lowering.
1437        */
1438       NIR_PASS_V(producer, nir_lower_io_to_temporaries,
1439                  nir_shader_get_entrypoint(producer), true, false);
1440       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
1441       NIR_PASS(_, producer, nir_split_var_copies);
1442       NIR_PASS(_, producer, nir_lower_var_copies);
1443    }
1444 
1445    if (producer->info.stage == MESA_SHADER_TASK &&
1446          consumer->info.stage == MESA_SHADER_MESH) {
1447 
1448       for (unsigned i = 0; i < 3; ++i)
1449          assert(producer->info.mesh.ts_mesh_dispatch_dimensions[i] <= UINT16_MAX);
1450 
1451       nir_lower_compute_system_values_options options = {
1452             .lower_workgroup_id_to_index = true,
1453             .num_workgroups[0] = producer->info.mesh.ts_mesh_dispatch_dimensions[0],
1454             .num_workgroups[1] = producer->info.mesh.ts_mesh_dispatch_dimensions[1],
1455             .num_workgroups[2] = producer->info.mesh.ts_mesh_dispatch_dimensions[2],
1456             /* nir_lower_idiv generates expensive code */
1457             .shortcut_1d_workgroup_id = compiler->devinfo->verx10 >= 125,
1458       };
1459 
1460       NIR_PASS(_, consumer, nir_lower_compute_system_values, &options);
1461    }
1462 }
1463 
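/* Callback used by nir_opt_load_store_vectorize (see
 * brw_vectorize_lower_mem_access below) to decide whether two adjacent
 * accesses may be merged into a single access of the given size.
 */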
1464 bool
1465 brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
1466                              unsigned bit_size,
1467                              unsigned num_components,
1468                              int64_t hole_size,
1469                              nir_intrinsic_instr *low,
1470                              nir_intrinsic_instr *high,
1471                              void *data)
1472 {
1473    /* Don't combine things to generate 64-bit loads/stores.  We have to split
1474     * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
1475     * we don't want to make a mess for the back-end.
1476     */
1477    if (bit_size > 32)
1478       return false;
1479 
1480    if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
1481        low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
1482        low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
1483        low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
1484       if (num_components > 4) {
1485          if (bit_size != 32)
1486             return false;
1487 
1488          if (num_components > 32)
1489             return false;
1490 
1491          if (hole_size >= 8 * 4)
1492             return false;
1493       }
1494    } else {
1495       /* We can handle at most a vec4 right now.  Anything bigger would get
1496        * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
1497        */
1498       if (num_components > 4)
1499          return false;
1500 
1501       if (hole_size > 4)
1502          return false;
1503    }
1504 
1505 
1506    const uint32_t align = nir_combined_align(align_mul, align_offset);
1507 
1508    if (align < bit_size / 8)
1509       return false;
1510 
1511    return true;
1512 }
1513 
1514 static
1515 bool combine_all_memory_barriers(nir_intrinsic_instr *a,
1516                                  nir_intrinsic_instr *b,
1517                                  void *data)
1518 {
1519    /* Combine control barriers with identical memory semantics. This prevents
1520     * the second barrier from generating a spurious fence message identical to
1521     * the first barrier's.
1522     */
1523    if (nir_intrinsic_memory_modes(a) == nir_intrinsic_memory_modes(b) &&
1524        nir_intrinsic_memory_semantics(a) == nir_intrinsic_memory_semantics(b) &&
1525        nir_intrinsic_memory_scope(a) == nir_intrinsic_memory_scope(b)) {
1526       nir_intrinsic_set_execution_scope(a, MAX2(nir_intrinsic_execution_scope(a),
1527                                                 nir_intrinsic_execution_scope(b)));
1528       return true;
1529    }
1530 
1531    /* Only combine pure memory barriers */
1532    if ((nir_intrinsic_execution_scope(a) != SCOPE_NONE) ||
1533        (nir_intrinsic_execution_scope(b) != SCOPE_NONE))
1534       return false;
1535 
1536    /* Translation to backend IR will get rid of modes we don't care about, so
1537     * no harm in always combining them.
1538     *
1539     * TODO: While HW has only ACQUIRE|RELEASE fences, we could improve the
1540     * scheduling so that it can take advantage of the different semantics.
1541     */
1542    nir_intrinsic_set_memory_modes(a, nir_intrinsic_memory_modes(a) |
1543                                      nir_intrinsic_memory_modes(b));
1544    nir_intrinsic_set_memory_semantics(a, nir_intrinsic_memory_semantics(a) |
1545                                          nir_intrinsic_memory_semantics(b));
1546    nir_intrinsic_set_memory_scope(a, MAX2(nir_intrinsic_memory_scope(a),
1547                                           nir_intrinsic_memory_scope(b)));
1548    return true;
1549 }
1550 
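/* Callback for nir_lower_mem_access_bit_sizes: given an access of `bytes`
 * bytes with the known alignment, return the bit size, component count, and
 * alignment the back-end can actually handle, so the pass can rewrite the
 * access to match.
 *
 * Illustrative example: a 6-byte SSBO load with a constant offset,
 * align_mul == 4 and align_offset == 2 takes the first case below and is
 * widened to a 2-component 32-bit load (pad == 2, DIV_ROUND_UP(6 + 2, 4) == 2)
 * aligned to 4 bytes.
 */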
1551 static nir_mem_access_size_align
1552 get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes,
1553                           uint8_t bit_size, uint32_t align_mul, uint32_t align_offset,
1554                           bool offset_is_const, enum gl_access_qualifier access,
1555                           const void *cb_data)
1556 {
1557    const uint32_t align = nir_combined_align(align_mul, align_offset);
1558 
1559    switch (intrin) {
1560    case nir_intrinsic_load_ssbo:
1561    case nir_intrinsic_load_shared:
1562    case nir_intrinsic_load_scratch:
1563       /* If the offset is constant, we can use a 32-bit load and just shift
1564        * the result around as needed.
1565        */
1566       if (align < 4 && offset_is_const) {
1567          assert(util_is_power_of_two_nonzero(align_mul) && align_mul >= 4);
1568          const unsigned pad = align_offset % 4;
1569          const unsigned comps32 = MIN2(DIV_ROUND_UP(bytes + pad, 4), 4);
1570          return (nir_mem_access_size_align) {
1571             .bit_size = 32,
1572             .num_components = comps32,
1573             .align = 4,
1574             .shift = nir_mem_access_shift_method_scalar,
1575          };
1576       }
1577       break;
1578 
1579    case nir_intrinsic_load_task_payload:
1580       if (bytes < 4 || align < 4) {
1581          return (nir_mem_access_size_align) {
1582             .bit_size = 32,
1583             .num_components = 1,
1584             .align = 4,
1585             .shift = nir_mem_access_shift_method_scalar,
1586          };
1587       }
1588       break;
1589 
1590    default:
1591       break;
1592    }
1593 
1594    const bool is_load = nir_intrinsic_infos[intrin].has_dest;
1595    const bool is_scratch = intrin == nir_intrinsic_load_scratch ||
1596                            intrin == nir_intrinsic_store_scratch;
1597 
1598    if (align < 4 || bytes < 4) {
1599       /* Choose a byte, word, or dword */
1600       bytes = MIN2(bytes, 4);
1601       if (bytes == 3)
1602          bytes = is_load ? 4 : 2;
1603 
1604       if (is_scratch) {
1605          /* The way scratch address swizzling works in the back-end, it
1606           * happens at a DWORD granularity so we can't have a single load
1607           * or store cross a DWORD boundary.
1608           */
1609          if ((align_offset % 4) + bytes > MIN2(align_mul, 4))
1610             bytes = MIN2(align_mul, 4) - (align_offset % 4);
1611 
1612          /* Must be a power of two */
1613          if (bytes == 3)
1614             bytes = 2;
1615       }
1616 
1617       return (nir_mem_access_size_align) {
1618          .bit_size = bytes * 8,
1619          .num_components = 1,
1620          .align = 1,
1621          .shift = nir_mem_access_shift_method_scalar,
1622       };
1623    } else {
1624       bytes = MIN2(bytes, 16);
1625       return (nir_mem_access_size_align) {
1626          .bit_size = 32,
1627          .num_components = is_scratch ? 1 :
1628                            is_load ? DIV_ROUND_UP(bytes, 4) : bytes / 4,
1629          .align = 4,
1630          .shift = nir_mem_access_shift_method_scalar,
1631       };
1632    }
1633 }
1634 
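/* Vectorize UBO/SSBO/shared/global/task-payload access (honoring the
 * robustness flags), turn uniform SSBO loads into block loads where the HW
 * supports them, then lower access bit sizes to what the back-end can handle
 * and clean up the result.
 */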
1635 static void
1636 brw_vectorize_lower_mem_access(nir_shader *nir,
1637                                const struct brw_compiler *compiler,
1638                                enum brw_robustness_flags robust_flags)
1639 {
1640    bool progress = false;
1641 
1642    nir_load_store_vectorize_options options = {
1643       .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
1644                nir_var_mem_global | nir_var_mem_shared |
1645                nir_var_mem_task_payload,
1646       .callback = brw_nir_should_vectorize_mem,
1647       .robust_modes = (nir_variable_mode)0,
1648    };
1649 
1650    if (robust_flags & BRW_ROBUSTNESS_UBO)
1651       options.robust_modes |= nir_var_mem_ubo;
1652    if (robust_flags & BRW_ROBUSTNESS_SSBO)
1653       options.robust_modes |= nir_var_mem_ssbo;
1654 
1655    OPT(nir_opt_load_store_vectorize, &options);
1656 
1657    /* When the HW supports block loads, use divergence analysis to find
1658     * uniform SSBO loads and turn them into block loads.
1659     *
1660     * Rerun the vectorizer afterwards to form the largest possible block
1661     * loads.
1662     *
1663     * This is a win on two fronts:
1664     *   - fewer send messages
1665     *   - reduced register pressure
1666     */
1667    nir_divergence_analysis(nir);
1668    if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo)) {
1669       OPT(nir_opt_load_store_vectorize, &options);
1670 
1671       OPT(nir_opt_constant_folding);
1672       OPT(nir_copy_prop);
1673 
1674       if (OPT(brw_nir_rebase_const_offset_ubo_loads)) {
1675          OPT(nir_opt_cse);
1676          OPT(nir_copy_prop);
1677 
1678          nir_load_store_vectorize_options ubo_options = {
1679             .modes = nir_var_mem_ubo,
1680             .callback = brw_nir_should_vectorize_mem,
1681             .robust_modes = options.robust_modes & nir_var_mem_ubo,
1682          };
1683 
1684          OPT(nir_opt_load_store_vectorize, &ubo_options);
1685       }
1686    }
1687 
1688    nir_lower_mem_access_bit_sizes_options mem_access_options = {
1689       .modes = nir_var_mem_ssbo |
1690                nir_var_mem_constant |
1691                nir_var_mem_task_payload |
1692                nir_var_shader_temp |
1693                nir_var_function_temp |
1694                nir_var_mem_global |
1695                nir_var_mem_shared,
1696       .callback = get_mem_access_size_align,
1697    };
1698    OPT(nir_lower_mem_access_bit_sizes, &mem_access_options);
1699 
1700    while (progress) {
1701       progress = false;
1702 
1703       OPT(nir_lower_pack);
1704       OPT(nir_copy_prop);
1705       OPT(nir_opt_dce);
1706       OPT(nir_opt_cse);
1707       OPT(nir_opt_algebraic);
1708       OPT(nir_opt_constant_folding);
1709    }
1710 }
1711 
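/* Returns true if any function impl still declares function_temp locals. */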
1712 static bool
1713 nir_shader_has_local_variables(const nir_shader *nir)
1714 {
1715    nir_foreach_function_impl(impl, nir) {
1716       if (!exec_list_is_empty(&impl->locals))
1717          return true;
1718    }
1719 
1720    return false;
1721 }
1722 
1723 /* Prepare the given shader for codegen
1724  *
1725  * This function is intended to be called right before going into the actual
1726  * backend and is highly backend-specific.  Also, once this function has been
1727  * called on a shader, it will no longer be in SSA form so most optimizations
1728  * will not work.
1729  */
1730 void
1731 brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
1732                     bool debug_enabled,
1733                     enum brw_robustness_flags robust_flags)
1734 {
1735    const struct intel_device_info *devinfo = compiler->devinfo;
1736 
1737    UNUSED bool progress; /* Written by OPT */
1738 
1739    OPT(intel_nir_lower_sparse_intrinsics);
1740 
1741    OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
1742 
1743    OPT(nir_opt_combine_barriers, combine_all_memory_barriers, NULL);
1744 
1745    do {
1746       progress = false;
1747       OPT(nir_opt_algebraic_before_ffma);
1748    } while (progress);
1749 
1750    if (devinfo->verx10 >= 125) {
1751       /* Lower integer division by constants before nir_lower_idiv. */
1752       OPT(nir_opt_idiv_const, 32);
1753       const nir_lower_idiv_options options = {
1754          .allow_fp16 = false
1755       };
1756       OPT(nir_lower_idiv, &options);
1757    }
1758 
1759    if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
1760       NIR_PASS(_, nir, intel_nir_lower_shading_rate_output);
1761 
1762    OPT(brw_nir_tag_speculative_access);
1763 
1764    brw_nir_optimize(nir, devinfo);
1765 
1766    if (nir_shader_has_local_variables(nir)) {
1767       OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
1768           glsl_get_natural_size_align_bytes);
1769       OPT(nir_lower_explicit_io, nir_var_function_temp,
1770           nir_address_format_32bit_offset);
1771       brw_nir_optimize(nir, devinfo);
1772    }
1773 
1774    brw_vectorize_lower_mem_access(nir, compiler, robust_flags);
1775 
1776    /* Needs to run prior to int64 lowering because it generates 64-bit
1777     * address manipulations.
1778     */
1779    OPT(intel_nir_lower_printf);
1780 
1781    /* Potentially perform this optimization pass twice because it can create
1782     * additional opportunities for itself.
1783     */
1784    if (OPT(nir_opt_algebraic_before_lower_int64))
1785       OPT(nir_opt_algebraic_before_lower_int64);
1786 
1787    if (OPT(nir_lower_int64))
1788       brw_nir_optimize(nir, devinfo);
1789 
1790    /* Try to fuse multiply-adds; if successful, run shrink_vectors to
1791     * keep peephole_ffma from generating things like this:
1792     *    vec16 ssa_0 = ...
1793     *    vec16 ssa_1 = fneg ssa_0
1794     *    vec1  ssa_2 = ffma ssa_1, ...
1795     *
1796     * We want this instead :
1797     *    vec16 ssa_0 = ...
1798     *    vec1  ssa_1 = fneg ssa_0.x
1799     *    vec1  ssa_2 = ffma ssa_1, ...
1800     */
1801    if (OPT(intel_nir_opt_peephole_ffma))
1802       OPT(nir_opt_shrink_vectors, false);
1803 
1804    OPT(intel_nir_opt_peephole_imul32x16);
1805 
1806    if (OPT(nir_opt_comparison_pre)) {
1807       OPT(nir_copy_prop);
1808       OPT(nir_opt_dce);
1809       OPT(nir_opt_cse);
1810 
1811       /* Do the select peephole again.  nir_opt_comparison_pre (combined with
1812        * the other optimization passes) will have removed at least one
1813        * instruction from one of the branches of the if-statement, so now it
1814        * might be under the threshold of conversion to bcsel.
1815        */
1816       OPT(nir_opt_peephole_select, 0, false, false);
1817       OPT(nir_opt_peephole_select, 1, false, true);
1818    }
1819 
1820    do {
1821       progress = false;
1822 
1823       OPT(brw_nir_opt_fsat);
1824       OPT(nir_opt_algebraic_late);
1825       OPT(brw_nir_lower_fsign);
1826 
1827       if (progress) {
1828          OPT(nir_opt_constant_folding);
1829          OPT(nir_copy_prop);
1830          OPT(nir_opt_dce);
1831          OPT(nir_opt_cse);
1832       }
1833    } while (progress);
1834 
1835 
1836    if (OPT(nir_lower_fp16_casts, nir_lower_fp16_split_fp64)) {
1837       if (OPT(nir_lower_int64)) {
1838          brw_nir_optimize(nir, devinfo);
1839       }
1840    }
1841 
1842    OPT(nir_lower_alu_to_scalar, NULL, NULL);
1843 
1844    while (OPT(nir_opt_algebraic_distribute_src_mods)) {
1845       OPT(nir_opt_constant_folding);
1846       OPT(nir_copy_prop);
1847       OPT(nir_opt_dce);
1848       OPT(nir_opt_cse);
1849    }
1850 
1851    OPT(nir_copy_prop);
1852    OPT(nir_opt_dce);
1853    OPT(nir_opt_move, nir_move_comparisons);
1854    OPT(nir_opt_dead_cf);
1855 
1856    bool divergence_analysis_dirty = false;
1857    NIR_PASS_V(nir, nir_divergence_analysis);
1858 
1859    static const nir_lower_subgroups_options subgroups_options = {
1860       .ballot_bit_size = 32,
1861       .ballot_components = 1,
1862       .lower_elect = true,
1863       .lower_subgroup_masks = true,
1864    };
1865 
1866    if (OPT(nir_opt_uniform_atomics, false)) {
1867       OPT(nir_lower_subgroups, &subgroups_options);
1868 
1869       OPT(nir_opt_algebraic_before_lower_int64);
1870 
1871       if (OPT(nir_lower_int64))
1872          brw_nir_optimize(nir, devinfo);
1873 
1874       divergence_analysis_dirty = true;
1875    }
1876 
1877    /* nir_opt_uniform_subgroup can create some operations (e.g.,
1878     * load_subgroup_lt_mask) that need to be lowered again.
1879     */
1880    if (OPT(nir_opt_uniform_subgroup, &subgroups_options)) {
1881       /* Some of the optimizations can generate 64-bit integer multiplication
1882        * that must be lowered.
1883        */
1884       OPT(nir_lower_int64);
1885 
1886       /* Even if nir_lower_int64 did not make progress, re-run the main
1887        * optimization loop. nir_opt_uniform_subgroup may have caused some things
1888        * that previously appeared divergent to be marked as convergent. This
1889        * allows the elimination of some loops over, say, a TXF instruction
1890        * with a non-uniform texture handle.
1891        */
1892       brw_nir_optimize(nir, devinfo);
1893 
1894       OPT(nir_lower_subgroups, &subgroups_options);
1895    }
1896 
1897    /* Run intel_nir_lower_conversions only after the last time
1898     * brw_nir_optimize is called. Various optimizations invoked there can
1899     * rematerialize the conversions that the lowering pass eliminates.
1900     */
1901    OPT(intel_nir_lower_conversions);
1902 
1903    /* Do this only after the last opt_gcm. GCM will undo this lowering. */
1904    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1905       if (divergence_analysis_dirty) {
1906          NIR_PASS_V(nir, nir_divergence_analysis);
1907       }
1908 
1909       OPT(intel_nir_lower_non_uniform_barycentric_at_sample);
1910    }
1911 
1912    OPT(nir_lower_bool_to_int32);
1913    OPT(nir_copy_prop);
1914    OPT(nir_opt_dce);
1915 
1916    OPT(nir_lower_locals_to_regs, 32);
1917 
1918    if (unlikely(debug_enabled)) {
1919       /* Re-index SSA defs so we print more sensible numbers. */
1920       nir_foreach_function_impl(impl, nir) {
1921          nir_index_ssa_defs(impl);
1922       }
1923 
1924       fprintf(stderr, "NIR (SSA form) for %s shader:\n",
1925               _mesa_shader_stage_to_string(nir->info.stage));
1926       nir_print_shader(nir, stderr);
1927    }
1928 
1929    nir_validate_ssa_dominance(nir, "before nir_convert_from_ssa");
1930 
1931    /* Rerun the divergence analysis before convert_from_ssa as this pass has
1932     * asserts on consistent divergence flags.
1933     */
1934    NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
1935    NIR_PASS_V(nir, nir_divergence_analysis);
1936 
1937    OPT(nir_convert_from_ssa, true);
1938 
1939    OPT(nir_opt_dce);
1940 
1941    if (OPT(nir_opt_rematerialize_compares))
1942       OPT(nir_opt_dce);
1943 
1944    /* The mesh stages require this pass to be called at the last minute,
1945     * but if it does anything it will also constant fold, and that undoes
1946     * the work done by nir_trivialize_registers, so call it right before
1947     * that pass instead.
1948     */
1949    if (nir->info.stage == MESA_SHADER_MESH ||
1950        nir->info.stage == MESA_SHADER_TASK)
1951       brw_nir_adjust_payload(nir);
1952 
1953    nir_trivialize_registers(nir);
1954 
1955    nir_sweep(nir);
1956 
1957    if (unlikely(debug_enabled)) {
1958       fprintf(stderr, "NIR (final form) for %s shader:\n",
1959               _mesa_shader_stage_to_string(nir->info.stage));
1960       nir_print_shader(nir, stderr);
1961    }
1962 }
1963 
1964 static unsigned
1965 get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
1966 {
1967    switch (info->subgroup_size) {
1968    case SUBGROUP_SIZE_API_CONSTANT:
1969       /* We have to use the global constant size. */
1970       return BRW_SUBGROUP_SIZE;
1971 
1972    case SUBGROUP_SIZE_UNIFORM:
1973       /* It has to be uniform across all invocations but can vary per stage
1974        * if we want.  This gives us a bit more freedom.
1975        *
1976        * For compute, brw_nir_apply_key is called per-dispatch-width so this
1977        * is the actual subgroup size and not a maximum.  However, we only
1978        * invoke one size of any given compute shader so it's still guaranteed
1979        * to be uniform across invocations.
1980        */
1981       return max_subgroup_size;
1982 
1983    case SUBGROUP_SIZE_VARYING:
1984       /* The subgroup size is allowed to be fully varying.  For geometry
1985        * stages, we know it's always 8 which is max_subgroup_size so we can
1986        * return that.  For compute, brw_nir_apply_key is called once per
1987        * dispatch-width so max_subgroup_size is the real subgroup size.
1988        *
1989        * For fragment, we return 0 and let it fall through to the back-end
1990        * compiler.  This means we can't optimize based on subgroup size but
1991        * that's a risk the client took when it asked for a varying subgroup
1992        * size.
1993        */
1994       return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
1995 
1996    case SUBGROUP_SIZE_REQUIRE_4:
1997       unreachable("Unsupported subgroup size type");
1998 
1999    case SUBGROUP_SIZE_REQUIRE_8:
2000    case SUBGROUP_SIZE_REQUIRE_16:
2001    case SUBGROUP_SIZE_REQUIRE_32:
2002       assert(gl_shader_stage_uses_workgroup(info->stage) ||
2003              (info->stage >= MESA_SHADER_RAYGEN && info->stage <= MESA_SHADER_CALLABLE));
2004       /* These enum values are expressly chosen to be equal to the subgroup
2005        * size that they require.
2006        */
2007       return info->subgroup_size;
2008 
2009    case SUBGROUP_SIZE_FULL_SUBGROUPS:
2010    case SUBGROUP_SIZE_REQUIRE_64:
2011    case SUBGROUP_SIZE_REQUIRE_128:
2012       break;
2013    }
2014 
2015    unreachable("Invalid subgroup size type");
2016 }
2017 
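/* Thin wrapper around get_subgroup_size() for the given shader; returns 0
 * when the size is left to the back-end (varying size in fragment shaders).
 */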
2018 unsigned
2019 brw_nir_api_subgroup_size(const nir_shader *nir,
2020                           unsigned hw_subgroup_size)
2021 {
2022    return get_subgroup_size(&nir->info, hw_subgroup_size);
2023 }
2024 
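/* Apply key-dependent lowering: texture lowering (including the combined
 * LOD/array-index form on ver >= 20), subgroup lowering at the subgroup size
 * implied by the shader info and max_subgroup_size, and the trig input range
 * workaround when the key requests it.  Re-optimizes if anything changed.
 */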
2025 void
2026 brw_nir_apply_key(nir_shader *nir,
2027                   const struct brw_compiler *compiler,
2028                   const struct brw_base_prog_key *key,
2029                   unsigned max_subgroup_size)
2030 {
2031    bool progress = false;
2032 
2033    nir_lower_tex_options nir_tex_opts = {
2034       .lower_txd_clamp_bindless_sampler = true,
2035       .lower_txd_clamp_if_sampler_index_not_lt_16 = true,
2036       .lower_invalid_implicit_lod = true,
2037       .lower_index_to_offset = true,
2038    };
2039    OPT(nir_lower_tex, &nir_tex_opts);
2040 
2041    const struct intel_nir_lower_texture_opts tex_opts = {
2042       .combined_lod_and_array_index = compiler->devinfo->ver >= 20,
2043    };
2044    OPT(intel_nir_lower_texture, &tex_opts);
2045 
2046    const nir_lower_subgroups_options subgroups_options = {
2047       .subgroup_size = get_subgroup_size(&nir->info, max_subgroup_size),
2048       .ballot_bit_size = 32,
2049       .ballot_components = 1,
2050       .lower_subgroup_masks = true,
2051    };
2052    OPT(nir_lower_subgroups, &subgroups_options);
2053 
2054    if (key->limit_trig_input_range)
2055       OPT(brw_nir_limit_trig_input_range_workaround);
2056 
2057    if (progress) {
2058       brw_nir_optimize(nir, compiler->devinfo);
2059    }
2060 }
2061 
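/* Translate a NIR comparison opcode into the equivalent BRW conditional
 * modifier (e.g. nir_op_flt -> BRW_CONDITIONAL_L).
 */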
2062 enum brw_conditional_mod
2063 brw_cmod_for_nir_comparison(nir_op op)
2064 {
2065    switch (op) {
2066    case nir_op_flt:
2067    case nir_op_flt32:
2068    case nir_op_ilt:
2069    case nir_op_ilt32:
2070    case nir_op_ult:
2071    case nir_op_ult32:
2072       return BRW_CONDITIONAL_L;
2073 
2074    case nir_op_fge:
2075    case nir_op_fge32:
2076    case nir_op_ige:
2077    case nir_op_ige32:
2078    case nir_op_uge:
2079    case nir_op_uge32:
2080       return BRW_CONDITIONAL_GE;
2081 
2082    case nir_op_feq:
2083    case nir_op_feq32:
2084    case nir_op_ieq:
2085    case nir_op_ieq32:
2086    case nir_op_b32all_fequal2:
2087    case nir_op_b32all_iequal2:
2088    case nir_op_b32all_fequal3:
2089    case nir_op_b32all_iequal3:
2090    case nir_op_b32all_fequal4:
2091    case nir_op_b32all_iequal4:
2092       return BRW_CONDITIONAL_Z;
2093 
2094    case nir_op_fneu:
2095    case nir_op_fneu32:
2096    case nir_op_ine:
2097    case nir_op_ine32:
2098    case nir_op_b32any_fnequal2:
2099    case nir_op_b32any_inequal2:
2100    case nir_op_b32any_fnequal3:
2101    case nir_op_b32any_inequal3:
2102    case nir_op_b32any_fnequal4:
2103    case nir_op_b32any_inequal4:
2104       return BRW_CONDITIONAL_NZ;
2105 
2106    default:
2107       unreachable("Unsupported NIR comparison op");
2108    }
2109 }
2110 
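/* Map a NIR memory or atomic intrinsic to the LSC opcode used to implement
 * it.  Integer-add atomics with a constant +1/-1 operand are strength-reduced
 * to atomic INC/DEC.
 */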
2111 enum lsc_opcode
2112 lsc_op_for_nir_intrinsic(const nir_intrinsic_instr *intrin)
2113 {
2114    switch (intrin->intrinsic) {
2115    case nir_intrinsic_load_ssbo:
2116    case nir_intrinsic_load_shared:
2117    case nir_intrinsic_load_global:
2118    case nir_intrinsic_load_global_block_intel:
2119    case nir_intrinsic_load_global_constant:
2120    case nir_intrinsic_load_global_constant_uniform_block_intel:
2121    case nir_intrinsic_load_shared_block_intel:
2122    case nir_intrinsic_load_shared_uniform_block_intel:
2123    case nir_intrinsic_load_ssbo_block_intel:
2124    case nir_intrinsic_load_ssbo_uniform_block_intel:
2125    case nir_intrinsic_load_ubo_uniform_block_intel:
2126    case nir_intrinsic_load_scratch:
2127       return LSC_OP_LOAD;
2128 
2129    case nir_intrinsic_store_ssbo:
2130    case nir_intrinsic_store_shared:
2131    case nir_intrinsic_store_global:
2132    case nir_intrinsic_store_global_block_intel:
2133    case nir_intrinsic_store_shared_block_intel:
2134    case nir_intrinsic_store_ssbo_block_intel:
2135    case nir_intrinsic_store_scratch:
2136       return LSC_OP_STORE;
2137 
2138    case nir_intrinsic_image_load:
2139    case nir_intrinsic_bindless_image_load:
2140       return LSC_OP_LOAD_CMASK;
2141 
2142    case nir_intrinsic_image_store:
2143    case nir_intrinsic_bindless_image_store:
2144       return LSC_OP_STORE_CMASK;
2145 
2146    default:
2147       assert(nir_intrinsic_has_atomic_op(intrin));
2148       break;
2149    }
2150 
2151    switch (nir_intrinsic_atomic_op(intrin)) {
2152    case nir_atomic_op_iadd: {
2153       unsigned src_idx;
2154       switch (intrin->intrinsic) {
2155       case nir_intrinsic_image_atomic:
2156       case nir_intrinsic_bindless_image_atomic:
2157          src_idx = 3;
2158          break;
2159       case nir_intrinsic_ssbo_atomic:
2160          src_idx = 2;
2161          break;
2162       case nir_intrinsic_shared_atomic:
2163       case nir_intrinsic_global_atomic:
2164          src_idx = 1;
2165          break;
2166       default:
2167          unreachable("Invalid add atomic opcode");
2168       }
2169 
2170       if (nir_src_is_const(intrin->src[src_idx])) {
2171          int64_t add_val = nir_src_as_int(intrin->src[src_idx]);
2172          if (add_val == 1)
2173             return LSC_OP_ATOMIC_INC;
2174          else if (add_val == -1)
2175             return LSC_OP_ATOMIC_DEC;
2176       }
2177       return LSC_OP_ATOMIC_ADD;
2178    }
2179 
2180    case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN;
2181    case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN;
2182    case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX;
2183    case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX;
2184    case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND;
2185    case nir_atomic_op_ior:  return LSC_OP_ATOMIC_OR;
2186    case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR;
2187    case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE;
2188    case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG;
2189 
2190    case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN;
2191    case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX;
2192    case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG;
2193    case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD;
2194 
2195    default:
2196       unreachable("Unsupported NIR atomic intrinsic");
2197    }
2198 }
2199 
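/* Translate a NIR ALU type into the corresponding BRW register type;
 * booleans are treated as 32-bit signed integers.
 */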
2200 enum brw_reg_type
2201 brw_type_for_nir_type(const struct intel_device_info *devinfo,
2202                       nir_alu_type type)
2203 {
2204    switch (type) {
2205    case nir_type_uint:
2206    case nir_type_uint32:
2207       return BRW_TYPE_UD;
2208    case nir_type_bool:
2209    case nir_type_int:
2210    case nir_type_bool32:
2211    case nir_type_int32:
2212       return BRW_TYPE_D;
2213    case nir_type_float:
2214    case nir_type_float32:
2215       return BRW_TYPE_F;
2216    case nir_type_float16:
2217       return BRW_TYPE_HF;
2218    case nir_type_float64:
2219       return BRW_TYPE_DF;
2220    case nir_type_int64:
2221       return BRW_TYPE_Q;
2222    case nir_type_uint64:
2223       return BRW_TYPE_UQ;
2224    case nir_type_int16:
2225       return BRW_TYPE_W;
2226    case nir_type_uint16:
2227       return BRW_TYPE_UW;
2228    case nir_type_int8:
2229       return BRW_TYPE_B;
2230    case nir_type_uint8:
2231       return BRW_TYPE_UB;
2232    default:
2233       unreachable("unknown type");
2234    }
2235 
2236    return BRW_TYPE_F;
2237 }
2238 
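/* Build a passthrough TCS whose per-vertex I/O is the key's outputs_written
 * minus the tess levels, set its primitive mode from the key, and run it
 * through brw_preprocess_nir.
 */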
2239 nir_shader *
2240 brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler,
2241                                const struct brw_tcs_prog_key *key)
2242 {
2243    assert(key->input_vertices > 0);
2244 
2245    const nir_shader_compiler_options *options =
2246       compiler->nir_options[MESA_SHADER_TESS_CTRL];
2247 
2248    uint64_t inputs_read = key->outputs_written &
2249       ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
2250 
2251    unsigned locations[64];
2252    unsigned num_locations = 0;
2253 
2254    u_foreach_bit64(varying, inputs_read)
2255       locations[num_locations++] = varying;
2256 
2257    nir_shader *nir =
2258       nir_create_passthrough_tcs_impl(options, locations, num_locations,
2259                                       key->input_vertices);
2260 
2261    ralloc_steal(mem_ctx, nir);
2262 
2263    nir->info.inputs_read = inputs_read;
2264    nir->info.tess._primitive_mode = key->_tes_primitive_mode;
2265    nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs");
2266 
2267    struct brw_nir_compiler_opts opts = {};
2268    brw_preprocess_nir(compiler, nir, &opts);
2269 
2270    return nir;
2271 }
2272 
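/* Rewrite a load_uniform intrinsic into loads from a global constant buffer
 * at base_addr + off.  Constant offsets use 64B-aligned uniform block loads
 * (two blocks, in case the value straddles a 64B boundary); dynamic offsets
 * fall back to a regular global constant load.
 *
 * Illustrative example: a constant 3-component 32-bit load at total byte
 * offset 72 loads the 64B blocks at offsets 64 and 128 and extracts bytes
 * 8..19 of that pair.
 */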
2273 nir_def *
2274 brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
2275       nir_def *base_addr, unsigned off)
2276 {
2277    assert(load_uniform->intrinsic == nir_intrinsic_load_uniform);
2278 
2279    unsigned bit_size = load_uniform->def.bit_size;
2280    assert(bit_size >= 8 && bit_size % 8 == 0);
2281    unsigned byte_size = bit_size / 8;
2282    nir_def *sysval;
2283 
2284    if (nir_src_is_const(load_uniform->src[0])) {
2285       uint64_t offset = off +
2286                         nir_intrinsic_base(load_uniform) +
2287                         nir_src_as_uint(load_uniform->src[0]);
2288 
2289       /* Things should be component-aligned. */
2290       assert(offset % byte_size == 0);
2291 
2292       unsigned suboffset = offset % 64;
2293       uint64_t aligned_offset = offset - suboffset;
2294 
2295       /* Load two just in case we go over a 64B boundary */
2296       nir_def *data[2];
2297       for (unsigned i = 0; i < 2; i++) {
2298          nir_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
2299 
2300          data[i] = nir_load_global_constant_uniform_block_intel(
2301             b, 16, 32, addr,
2302             .access = ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE,
2303             .align_mul = 64);
2304       }
2305 
2306       sysval = nir_extract_bits(b, data, 2, suboffset * 8,
2307                                 load_uniform->num_components, bit_size);
2308    } else {
2309       nir_def *offset32 =
2310          nir_iadd_imm(b, load_uniform->src[0].ssa,
2311                          off + nir_intrinsic_base(load_uniform));
2312       nir_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset32));
2313       sysval = nir_load_global_constant(b, addr, byte_size,
2314                                         load_uniform->num_components, bit_size);
2315    }
2316 
2317    return sysval;
2318 }
2319 
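/* Return the GLSL type backing a variable: its interface type when present,
 * otherwise its own type with the outer array level stripped for arrayed I/O.
 */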
2320 const struct glsl_type *
2321 brw_nir_get_var_type(const struct nir_shader *nir, nir_variable *var)
2322 {
2323    const struct glsl_type *type = var->interface_type;
2324    if (!type) {
2325       type = var->type;
2326       if (nir_is_arrayed_io(var, nir->info.stage)) {
2327          assert(glsl_type_is_array(type));
2328          type = glsl_get_array_element(type);
2329       }
2330    }
2331 
2332    return type;
2333 }
2334 
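/* Returns true if the shader contains any load_inline_data_intel intrinsic. */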
2335 bool
2336 brw_nir_uses_inline_data(nir_shader *shader)
2337 {
2338    nir_foreach_function_impl(impl, shader) {
2339       nir_foreach_block(block, impl) {
2340          nir_foreach_instr(instr, block) {
2341             if (instr->type != nir_instr_type_intrinsic)
2342                continue;
2343 
2344             nir_intrinsic_instr *intrin  = nir_instr_as_intrinsic(instr);
2345             if (intrin->intrinsic != nir_intrinsic_load_inline_data_intel)
2346                continue;
2347 
2348             return true;
2349          }
2350       }
2351    }
2352 
2353    return false;
2354 }
2355 
2356 /**
2357  * Move load_interpolated_input with simple (payload-based) barycentric modes
2358  * to the top of the program so we don't emit multiple PLNs for the same input.
2359  *
2360  * This works around CSE not being able to handle non-dominating cases
2361  * such as:
2362  *
2363  *    if (...) {
2364  *       interpolate input
2365  *    } else {
2366  *       interpolate the same exact input
2367  *    }
2368  *
2369  * This should be replaced by global value numbering someday.
2370  */
2371 bool
2372 brw_nir_move_interpolation_to_top(nir_shader *nir)
2373 {
2374    bool progress = false;
2375 
2376    nir_foreach_function_impl(impl, nir) {
2377       nir_block *top = nir_start_block(impl);
2378       nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
2379       bool impl_progress = false;
2380 
2381       for (nir_block *block = nir_block_cf_tree_next(top);
2382            block != NULL;
2383            block = nir_block_cf_tree_next(block)) {
2384 
2385          nir_foreach_instr_safe(instr, block) {
2386             if (instr->type != nir_instr_type_intrinsic)
2387                continue;
2388 
2389             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2390             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
2391                continue;
2392             nir_intrinsic_instr *bary_intrinsic =
2393                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
2394             nir_intrinsic_op op = bary_intrinsic->intrinsic;
2395 
2396             /* Leave interpolateAtSample/Offset() where they are. */
2397             if (op == nir_intrinsic_load_barycentric_at_sample ||
2398                 op == nir_intrinsic_load_barycentric_at_offset)
2399                continue;
2400 
2401             nir_instr *move[3] = {
2402                &bary_intrinsic->instr,
2403                intrin->src[1].ssa->parent_instr,
2404                instr
2405             };
2406 
2407             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
2408                if (move[i]->block != top) {
2409                   nir_instr_move(cursor, move[i]);
2410                   impl_progress = true;
2411                }
2412             }
2413          }
2414       }
2415 
2416       progress = progress || impl_progress;
2417 
2418       nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
2419                                                 : nir_metadata_all);
2420    }
2421 
2422    return progress;
2423 }
2424 
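/* filter_simd/lower_simd implement brw_nir_lower_simd below: replace
 * load_simd_width_intel with the dispatch width and fold load_subgroup_id to
 * zero when the whole (fixed-size) workgroup fits in a single thread.
 *
 * Illustrative example, with dispatch_width == 16 and a fixed 8x1x1
 * workgroup:
 *    %w  = load_simd_width_intel   ->  %w  = 16
 *    %id = load_subgroup_id        ->  %id = 0
 */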
2425 static bool
2426 filter_simd(const nir_instr *instr, UNUSED const void *options)
2427 {
2428    if (instr->type != nir_instr_type_intrinsic)
2429       return false;
2430 
2431    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
2432    case nir_intrinsic_load_simd_width_intel:
2433    case nir_intrinsic_load_subgroup_id:
2434       return true;
2435 
2436    default:
2437       return false;
2438    }
2439 }
2440 
2441 static nir_def *
2442 lower_simd(nir_builder *b, nir_instr *instr, void *options)
2443 {
2444    uintptr_t simd_width = (uintptr_t)options;
2445 
2446    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
2447    case nir_intrinsic_load_simd_width_intel:
2448       return nir_imm_int(b, simd_width);
2449 
2450    case nir_intrinsic_load_subgroup_id:
2451       /* If the whole workgroup fits in one thread, we can lower subgroup_id
2452        * to a constant zero.
2453        */
2454       if (!b->shader->info.workgroup_size_variable) {
2455          unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
2456                                          b->shader->info.workgroup_size[1] *
2457                                          b->shader->info.workgroup_size[2];
2458          if (local_workgroup_size <= simd_width)
2459             return nir_imm_int(b, 0);
2460       }
2461       return NULL;
2462 
2463    default:
2464       return NULL;
2465    }
2466 }
2467 
2468 bool
2469 brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
2470 {
2471    return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
2472                                  (void *)(uintptr_t)dispatch_width);
2473 }
2474 
2475 
2476