/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Connor Abbott (cwabbott0@gmail.com)
 *
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/u_math.h"
#include "util/set.h"

struct lower_sysval_state {
   const nir_lower_compute_system_values_options *options;

   /* List of intrinsics that have already been lowered and shouldn't be
    * lowered again.
    */
   struct set *lower_once_list;
};
static nir_ssa_def *
sanitize_32bit_sysval(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->dest.is_ssa);
   const unsigned bit_size = intrin->dest.ssa.bit_size;
   if (bit_size == 32)
      return NULL;

   intrin->dest.ssa.bit_size = 32;
   return nir_u2u(b, &intrin->dest.ssa, bit_size);
}
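
/* A sketch of the effect on the NIR (hypothetical 64-bit consumer): the
 * sysval load itself is rewritten to 32 bits and a conversion is inserted
 * for the original users:
 *
 *    before:  64x3 %id   = @load_local_invocation_id ()
 *    after:   32x3 %id32 = @load_local_invocation_id ()
 *             64x3 %id   = u2u64 %id32
 */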

static nir_ssa_def *
build_global_group_size(nir_builder *b, unsigned bit_size)
{
   nir_ssa_def *group_size = nir_load_workgroup_size(b);
   nir_ssa_def *num_workgroups = nir_load_num_workgroups(b, bit_size);
   return nir_imul(b, nir_u2u(b, group_size, bit_size),
                      num_workgroups);
}
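
/* For example (illustrative numbers): a workgroup size of (8, 8, 1) and
 * (4, 2, 1) workgroups yields a global group size of (32, 16, 1).
 */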

static bool
lower_system_value_filter(const nir_instr *instr, const void *_state)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_ssa_def *
lower_system_value_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return NULL;

   assert(intrin->dest.is_ssa);
   const unsigned bit_size = intrin->dest.ssa.bit_size;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_vertex_id:
      if (b->shader->options->vertex_id_zero_based) {
         return nir_iadd(b, nir_load_vertex_id_zero_base(b),
                            nir_load_first_vertex(b));
      } else {
         return NULL;
      }

   case nir_intrinsic_load_base_vertex:
      /**
       * From the OpenGL 4.6 specification (Section 11.1.3.9, Shader Inputs):
       *
       *    "gl_BaseVertex holds the integer value passed to the baseVertex
       *    parameter to the command that resulted in the current shader
       *    invocation. In the case where the command has no baseVertex
       *    parameter, the value of gl_BaseVertex is zero."
       */
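      /* The AND below relies on load_is_indexed_draw being ~0 for indexed
       * draws and 0 otherwise, so the result is first_vertex for indexed
       * draws and zero for non-indexed ones, matching the quoted behavior.
       */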
      if (b->shader->options->lower_base_vertex) {
         return nir_iand(b, nir_load_is_indexed_draw(b),
                            nir_load_first_vertex(b));
      } else {
         return NULL;
      }

   case nir_intrinsic_load_helper_invocation:
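      /* gl_HelperInvocation can be derived from the sample mask: a helper
       * invocation covers no samples, so bit gl_SampleID of gl_SampleMaskIn
       * is clear for it, which is what the lowering below tests.
       */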
      if (b->shader->options->lower_helper_invocation) {
         nir_ssa_def *tmp;
         tmp = nir_ishl(b, nir_imm_int(b, 1),
                           nir_load_sample_id_no_per_sample(b));
         tmp = nir_iand(b, nir_load_sample_mask_in(b), tmp);
         return nir_inot(b, nir_i2b(b, tmp));
      } else {
         return NULL;
      }

   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_local_invocation_index:
   case nir_intrinsic_load_workgroup_size:
      return sanitize_32bit_sysval(b, intrin);

   case nir_intrinsic_load_deref: {
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      if (!nir_deref_mode_is(deref, nir_var_system_value))
         return NULL;

      nir_ssa_def *column = NULL;
      if (deref->deref_type != nir_deref_type_var) {
         /* The only system values that aren't plain variables are
          * gl_SampleMaskIn, which is always an array of one element, and a
          * couple of ray-tracing intrinsics, which are matrices.
          */
         assert(deref->deref_type == nir_deref_type_array);
         assert(deref->arr.index.is_ssa);
         column = deref->arr.index.ssa;
         deref = nir_deref_instr_parent(deref);
         assert(deref->deref_type == nir_deref_type_var);
         assert(deref->var->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN ||
                deref->var->data.location == SYSTEM_VALUE_RAY_OBJECT_TO_WORLD ||
                deref->var->data.location == SYSTEM_VALUE_RAY_WORLD_TO_OBJECT);
      }
      nir_variable *var = deref->var;

      switch (var->data.location) {
      case SYSTEM_VALUE_INSTANCE_INDEX:
         return nir_iadd(b, nir_load_instance_id(b),
                            nir_load_base_instance(b));

      case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
      case SYSTEM_VALUE_SUBGROUP_GE_MASK:
      case SYSTEM_VALUE_SUBGROUP_GT_MASK:
      case SYSTEM_VALUE_SUBGROUP_LE_MASK:
      case SYSTEM_VALUE_SUBGROUP_LT_MASK: {
         nir_intrinsic_op op =
            nir_intrinsic_from_system_value(var->data.location);
         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
         nir_ssa_dest_init_for_type(&load->instr, &load->dest,
                                    var->type, NULL);
         load->num_components = load->dest.ssa.num_components;
         nir_builder_instr_insert(b, &load->instr);
         return &load->dest.ssa;
      }

      case SYSTEM_VALUE_DEVICE_INDEX:
         if (b->shader->options->lower_device_index_to_zero)
            return nir_imm_int(b, 0);
         break;

      case SYSTEM_VALUE_GLOBAL_GROUP_SIZE:
         return build_global_group_size(b, bit_size);

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_pixel,
                                     INTERP_MODE_NOPERSPECTIVE);

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid,
                                     INTERP_MODE_NOPERSPECTIVE);

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
                                     INTERP_MODE_NOPERSPECTIVE);

      case SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_pixel,
                                     INTERP_MODE_SMOOTH);

      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid,
                                     INTERP_MODE_SMOOTH);

      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
                                     INTERP_MODE_SMOOTH);

      case SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_model,
                                     INTERP_MODE_NONE);

      default:
         break;
      }

      nir_intrinsic_op sysval_op =
         nir_intrinsic_from_system_value(var->data.location);
      if (glsl_type_is_matrix(var->type)) {
         assert(nir_intrinsic_infos[sysval_op].index_map[NIR_INTRINSIC_COLUMN] > 0);
         unsigned num_cols = glsl_get_matrix_columns(var->type);
         ASSERTED unsigned num_rows = glsl_get_vector_elements(var->type);
         assert(num_rows == intrin->dest.ssa.num_components);

         nir_ssa_def *cols[4];
         for (unsigned i = 0; i < num_cols; i++) {
            cols[i] = nir_load_system_value(b, sysval_op, i,
                                            intrin->dest.ssa.num_components,
                                            intrin->dest.ssa.bit_size);
            assert(cols[i]->num_components == num_rows);
         }
         return nir_select_from_ssa_def_array(b, cols, num_cols, column);
      } else {
         return nir_load_system_value(b, sysval_op, 0,
                                      intrin->dest.ssa.num_components,
                                      intrin->dest.ssa.bit_size);
      }
   }

   default:
      return NULL;
   }
}

bool
nir_lower_system_values(nir_shader *shader)
{
   bool progress = nir_shader_lower_instructions(shader,
                                                 lower_system_value_filter,
                                                 lower_system_value_instr,
                                                 NULL);

   /* We're going to delete the variables, so we need to clean up all those
    * derefs we left lying around.
    */
   if (progress)
      nir_remove_dead_derefs(shader);

   nir_foreach_variable_with_modes_safe(var, shader, nir_var_system_value)
      exec_node_remove(&var->node);

   return progress;
}
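
/* Typical usage from a driver (a sketch; exact pass ordering varies by
 * driver):
 *
 *    NIR_PASS(progress, nir, nir_lower_system_values);
 */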

static bool
lower_compute_system_value_filter(const nir_instr *instr, const void *_state)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_ssa_def *
lower_compute_system_value_instr(nir_builder *b,
                                 nir_instr *instr, void *_state)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   struct lower_sysval_state *state = (struct lower_sysval_state *)_state;
   const nir_lower_compute_system_values_options *options = state->options;

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return NULL;

   assert(intrin->dest.is_ssa);
   const unsigned bit_size = intrin->dest.ssa.bit_size;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_local_invocation_id:
      /* If lower_cs_local_id_from_index is true, then we derive the local
       * ID from the local index.
       */
      if (b->shader->options->lower_cs_local_id_from_index) {
         /* We lower gl_LocalInvocationID from gl_LocalInvocationIndex based
          * on this formula:
          *
          *    gl_LocalInvocationID.x =
          *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
          *    gl_LocalInvocationID.y =
          *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
          *       gl_WorkGroupSize.y;
          *    gl_LocalInvocationID.z =
          *       (gl_LocalInvocationIndex /
          *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
          *       gl_WorkGroupSize.z;
          *
          * However, the final % gl_WorkGroupSize.z does nothing unless we
          * accidentally end up with a gl_LocalInvocationIndex that is too
          * large, so it can safely be omitted.
          */
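         /* Worked example (illustrative numbers): with a workgroup size of
          * (4, 4, 4), index 37 yields ID (37 % 4, (37 / 4) % 4, 37 / 16) =
          * (1, 1, 2).
          */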
         nir_ssa_def *local_index = nir_load_local_invocation_index(b);
         nir_ssa_def *local_size = nir_load_workgroup_size(b);

         /* Because no hardware supports a local workgroup size greater than
          * about 1K, this calculation can be done in 32-bit and can save some
          * 64-bit arithmetic.
          */
         nir_ssa_def *id_x, *id_y, *id_z;
         id_x = nir_umod(b, local_index,
                            nir_channel(b, local_size, 0));
         id_y = nir_umod(b, nir_udiv(b, local_index,
                                        nir_channel(b, local_size, 0)),
                            nir_channel(b, local_size, 1));
         id_z = nir_udiv(b, local_index,
                            nir_imul(b, nir_channel(b, local_size, 0),
                                        nir_channel(b, local_size, 1)));
         return nir_u2u(b, nir_vec3(b, id_x, id_y, id_z), bit_size);
      }
      if (options && options->shuffle_local_ids_for_quad_derivatives &&
          b->shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
          _mesa_set_search(state->lower_once_list, instr) == NULL) {
         nir_ssa_def *ids = nir_load_local_invocation_id(b);
         _mesa_set_add(state->lower_once_list, ids->parent_instr);

         nir_ssa_def *x = nir_channel(b, ids, 0);
         nir_ssa_def *y = nir_channel(b, ids, 1);
         nir_ssa_def *z = nir_channel(b, ids, 2);
         unsigned size_x = b->shader->info.workgroup_size[0];
         nir_ssa_def *size_x_imm;

         if (b->shader->info.workgroup_size_variable)
            size_x_imm = nir_channel(b, nir_load_workgroup_size(b), 0);
         else
            size_x_imm = nir_imm_int(b, size_x);

         /* Remap indices from:
          *    | 0| 1| 2| 3|
          *    | 4| 5| 6| 7|
          *    | 8| 9|10|11|
          *    |12|13|14|15|
          * to:
          *    | 0| 1| 4| 5|
          *    | 2| 3| 6| 7|
          *    | 8| 9|12|13|
          *    |10|11|14|15|
          *
          * That's the layout required by AMD hardware for derivatives to
          * work. Other hardware may work differently.
          *
          * It's a classic tiling pattern that can be implemented by inserting
          * bit y[0] between bits x[0] and x[1] like this:
          *
          *    x[0],y[0],x[1],...x[last],y[1],...,y[last]
          *
          * If the width is a power of two, use:
          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) | ((y & ~1) << logbase2(size_x))
          *
          * If the width is not a power of two or the local size is variable, use:
          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) + ((y & ~1) * size_x)
          *
          * GL_NV_compute_shader_derivatives requires that the width and height
          * are a multiple of two, which is also a requirement for the second
          * expression to work.
          *
          * The 2D result is: (x,y) = (i % w, i / w)
          */
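         /* Worked example (illustrative numbers): with size_x = 4, the
          * invocation at (x, y) = (0, 1) (linear index 4 in the original
          * layout) computes i = (0 | (1 << 1) | 0) | 0 = 2 and so reports
          * ID (2, 0), matching the "4" at row 0, column 2 of the target
          * layout above.
          */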

         nir_ssa_def *one = nir_imm_int(b, 1);
         nir_ssa_def *inv_one = nir_imm_int(b, ~1);
         nir_ssa_def *x_bit0 = nir_iand(b, x, one);
         nir_ssa_def *y_bit0 = nir_iand(b, y, one);
         nir_ssa_def *x_bits_1n = nir_iand(b, x, inv_one);
         nir_ssa_def *y_bits_1n = nir_iand(b, y, inv_one);
         nir_ssa_def *bits_01 = nir_ior(b, x_bit0, nir_ishl(b, y_bit0, one));
         nir_ssa_def *bits_01x = nir_ior(b, bits_01,
                                         nir_ishl(b, x_bits_1n, one));
         nir_ssa_def *i;

         if (!b->shader->info.workgroup_size_variable &&
             util_is_power_of_two_nonzero(size_x)) {
            nir_ssa_def *log2_size_x = nir_imm_int(b, util_logbase2(size_x));
            i = nir_ior(b, bits_01x, nir_ishl(b, y_bits_1n, log2_size_x));
         } else {
            i = nir_iadd(b, bits_01x, nir_imul(b, y_bits_1n, size_x_imm));
         }

         /* This should be fast if size_x is an immediate or even a power
          * of two.
          */
         x = nir_umod(b, i, size_x_imm);
         y = nir_udiv(b, i, size_x_imm);

         return nir_vec3(b, x, y, z);
      }
      return NULL;

   case nir_intrinsic_load_local_invocation_index:
      /* If lower_cs_local_index_from_id is true, then we derive the local
       * index from the local id.
       */
      if (b->shader->options->lower_cs_local_index_from_id ||
          (options && options->lower_local_invocation_index)) {
         /* From the GLSL man page for gl_LocalInvocationIndex:
          *
          *    "The value of gl_LocalInvocationIndex is equal to
          *    gl_LocalInvocationID.z * gl_WorkGroupSize.x *
          *    gl_WorkGroupSize.y + gl_LocalInvocationID.y *
          *    gl_WorkGroupSize.x + gl_LocalInvocationID.x"
          */
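         /* Worked example (illustrative numbers): with a workgroup size of
          * (8, 8, 1), ID (3, 2, 0) gives 0 * 64 + 2 * 8 + 3 = 19.
          */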
         nir_ssa_def *local_id = nir_load_local_invocation_id(b);

         nir_ssa_def *size_x =
            nir_imm_int(b, b->shader->info.workgroup_size[0]);
         nir_ssa_def *size_y =
            nir_imm_int(b, b->shader->info.workgroup_size[1]);

         /* Because no hardware supports a local workgroup size greater than
          * about 1K, this calculation can be done in 32-bit and can save some
          * 64-bit arithmetic.
          */
         nir_ssa_def *index;
         index = nir_imul(b, nir_channel(b, local_id, 2),
                             nir_imul(b, size_x, size_y));
         index = nir_iadd(b, index,
                             nir_imul(b, nir_channel(b, local_id, 1), size_x));
         index = nir_iadd(b, index, nir_channel(b, local_id, 0));
         return nir_u2u(b, index, bit_size);
      } else {
         return NULL;
      }

   case nir_intrinsic_load_workgroup_size:
      if (b->shader->info.workgroup_size_variable) {
         /* If the local workgroup size is variable it can't be lowered at
          * this point.  We do, however, have to make sure that the intrinsic
          * is only 32-bit.
          */
         return NULL;
      } else {
         /* Using a 32-bit constant is safe here, as no device/driver needs
          * more than 32 bits for the local size.
          */
         nir_const_value workgroup_size_const[3];
         memset(workgroup_size_const, 0, sizeof(workgroup_size_const));
         workgroup_size_const[0].u32 = b->shader->info.workgroup_size[0];
         workgroup_size_const[1].u32 = b->shader->info.workgroup_size[1];
         workgroup_size_const[2].u32 = b->shader->info.workgroup_size[2];
         return nir_u2u(b, nir_build_imm(b, 3, 32, workgroup_size_const), bit_size);
      }

   case nir_intrinsic_load_global_invocation_id_zero_base: {
      if ((options && options->has_base_workgroup_id) ||
          !b->shader->options->has_cs_global_id) {
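         /* Per the GLSL compute spec, gl_GlobalInvocationID is defined as
          * gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID, which
          * is exactly what we build here.
          */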
         nir_ssa_def *group_size = nir_load_workgroup_size(b);
         nir_ssa_def *group_id = nir_load_workgroup_id(b, bit_size);
         nir_ssa_def *local_id = nir_load_local_invocation_id(b);

         return nir_iadd(b, nir_imul(b, group_id,
                                        nir_u2u(b, group_size, bit_size)),
                            nir_u2u(b, local_id, bit_size));
      } else {
         return NULL;
      }
   }

   case nir_intrinsic_load_global_invocation_id: {
      if (options && options->has_base_global_invocation_id)
         return nir_iadd(b, nir_load_global_invocation_id_zero_base(b, bit_size),
                            nir_load_base_global_invocation_id(b, bit_size));
      else if ((options && options->has_base_workgroup_id) ||
               !b->shader->options->has_cs_global_id)
         return nir_load_global_invocation_id_zero_base(b, bit_size);
      else
         return NULL;
   }

   case nir_intrinsic_load_global_invocation_index: {
      /* OpenCL's global_linear_id explicitly removes the global offset
       * before computing this.
       */
      assert(b->shader->info.stage == MESA_SHADER_KERNEL);
      nir_ssa_def *global_base_id = nir_load_base_global_invocation_id(b, bit_size);
      nir_ssa_def *global_id = nir_isub(b, nir_load_global_invocation_id(b, bit_size), global_base_id);
      nir_ssa_def *global_size = build_global_group_size(b, bit_size);

      /* index = id.x + ((id.y + (id.z * size.y)) * size.x) */
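      /* Worked example (illustrative numbers): with a global size of
       * (4, 2, 2) and id (1, 1, 1), index = 1 + ((1 + 1 * 2) * 4) = 13.
       */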
      nir_ssa_def *index;
      index = nir_imul(b, nir_channel(b, global_id, 2),
                          nir_channel(b, global_size, 1));
      index = nir_iadd(b, nir_channel(b, global_id, 1), index);
      index = nir_imul(b, nir_channel(b, global_size, 0), index);
      index = nir_iadd(b, nir_channel(b, global_id, 0), index);
      return index;
   }

   case nir_intrinsic_load_workgroup_id: {
      if (options && options->has_base_workgroup_id)
         return nir_iadd(b, nir_u2u(b, nir_load_workgroup_id_zero_base(b), bit_size),
                            nir_load_base_workgroup_id(b, bit_size));
      else
         return NULL;
   }

   default:
      return NULL;
   }
}

bool
nir_lower_compute_system_values(nir_shader *shader,
                                const nir_lower_compute_system_values_options *options)
{
   if (!gl_shader_stage_uses_workgroup(shader->info.stage))
      return false;

   struct lower_sysval_state state;
   state.options = options;
   state.lower_once_list = _mesa_pointer_set_create(NULL);

   bool progress =
      nir_shader_lower_instructions(shader,
                                    lower_compute_system_value_filter,
                                    lower_compute_system_value_instr,
                                    (void *)&state);
   ralloc_free(state.lower_once_list);

   /* Update this so as not to lower it again. */
   if (options && options->shuffle_local_ids_for_quad_derivatives &&
       shader->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS)
      shader->info.cs.derivative_group = DERIVATIVE_GROUP_LINEAR;

   return progress;
}
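
/* Typical usage from a driver (a sketch; the option shown is one consumed
 * above, and real drivers set whichever fields match their hardware):
 *
 *    const nir_lower_compute_system_values_options csv_options = {
 *       .has_base_workgroup_id = true,
 *    };
 *    NIR_PASS(progress, nir, nir_lower_compute_system_values, &csv_options);
 */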