/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "util/u_math.h"

/**
 * \file nir_lower_subgroups.c
 */

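/* Emits a single-component, 32-bit copy of a 64-bit subgroup intrinsic that
 * operates on one half of the source: component 0 is the low 32 bits and
 * component 1 the high 32 bits.
 */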
static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   nir_ssa_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      nir_src_copy(&intr->src[1], &intrin->src[1]);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}

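/* Lowers a 64-bit subgroup operation to two 32-bit operations: one on the
 * low half of the source and one on the high half, with the results packed
 * back into a single 64-bit value.
 */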
static nir_ssa_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->dest.ssa, &intr_y->dest.ssa);
}

static nir_ssa_def *
ballot_type_to_uint(nir_builder *b, nir_ssa_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}

static nir_ssa_def *
uint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate.  This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot.  This comes up in Zink
    * for OpenGL on Vulkan.  It's the job of the driver calling this lowering
    * pass to ensure that it has restricted subgroup sizes sufficiently that
    * we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_trim_vector(b, value, num_components);

   return value;
}

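/* Scalarizes a vector subgroup operation: emits a single-component copy of
 * the intrinsic for each channel of the source and recombines the results
 * into a vector.
 */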
static nir_ssa_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin,
                            bool lower_to_32bit)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->dest.ssa.num_components > 1);

   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0],
                                           intrin->num_components);
   nir_ssa_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      /* invocation */
      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         nir_src_copy(&chan_intrin->src[1], &intrin->src[1]);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      if (lower_to_32bit && chan_intrin->src[0].ssa->bit_size == 64) {
         reads[i] = lower_subgroup_op_to_32bit(b, chan_intrin);
      } else {
         nir_builder_instr_insert(b, &chan_intrin->instr);
         reads[i] = &chan_intrin->dest.ssa;
      }
   }

   return nir_vec(b, reads, intrin->num_components);
}

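/* Scalarizes vote_feq/vote_ieq: votes on each channel separately and ANDs
 * the per-channel results together.
 */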
static nir_ssa_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   nir_ssa_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      nir_builder_instr_insert(b, &chan_intrin->instr);

      if (result) {
         result = nir_iand(b, result, &chan_intrin->dest.ssa);
      } else {
         result = &chan_intrin->dest.ssa;
      }
   }

   return result;
}

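/* Lowers vote_feq/vote_ieq to a comparison of each channel against the
 * first invocation's value, followed by a vote_all on the combined result.
 */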
static nir_ssa_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_ssa_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_ssa_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_ssa_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}

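/* Lowers a shuffle_xor with a small constant mask to AMD's masked swizzle,
 * which encodes the XOR in its swizzle mask.  Returns NULL when the mask is
 * too large (>= 32) to encode.
 */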
static nir_ssa_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin,
                         const nir_lower_subgroups_options *options)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   if (mask >= 32)
      return NULL;

   nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create(
      b->shader, nir_intrinsic_masked_swizzle_amd);
   swizzle->num_components = intrin->num_components;
   nir_src_copy(&swizzle->src[0], &intrin->src[0]);
   nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f);
   nir_ssa_dest_init(&swizzle->instr, &swizzle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   if (options->lower_to_scalar && swizzle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, swizzle, options->lower_shuffle_to_32bit);
   } else if (options->lower_shuffle_to_32bit && swizzle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, swizzle);
   } else {
      nir_builder_instr_insert(b, &swizzle->instr);
      return &swizzle->dest.ssa;
   }
}

/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */

static nir_ssa_def *
lower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                 const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {
      nir_ssa_def *result =
         lower_shuffle_to_swizzle(b, intrin, options);
      if (result)
         return result;
   }

   nir_ssa_def *index = nir_load_subgroup_invocation(b);
   bool is_shuffle = false;
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      assert(intrin->src[1].is_ssa);
      index = nir_ixor(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_up:
      assert(intrin->src[1].is_ssa);
      index = nir_isub(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_down:
      assert(intrin->src[1].is_ssa);
      index = nir_iadd(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_quad_broadcast:
      assert(intrin->src[1].is_ssa);
      index = nir_ior(b, nir_iand(b, index, nir_imm_int(b, ~0x3)),
                         intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For Quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index to a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   default:
      unreachable("Invalid intrinsic");
   }

   nir_intrinsic_instr *shuffle =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle);
   shuffle->num_components = intrin->num_components;
   nir_src_copy(&shuffle->src[0], &intrin->src[0]);
   shuffle->src[1] = nir_src_for_ssa(index);
   nir_ssa_dest_init(&shuffle->instr, &shuffle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   bool lower_to_32bit = options->lower_shuffle_to_32bit && is_shuffle;
   if (options->lower_to_scalar && shuffle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, shuffle, lower_to_32bit);
   } else if (lower_to_32bit && shuffle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, shuffle);
   } else {
      nir_builder_instr_insert(b, &shuffle->instr);
      return &shuffle->dest.ssa;
   }
}

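/* Builds the GLSL type corresponding to an SSA def, so that a local
 * variable can be created to hold its value.
 */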
static const struct glsl_type *
glsl_type_for_ssa(nir_ssa_def *def)
{
   const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type() :
      glsl_uintN_t_type(def->bit_size);
   return glsl_replace_vector_type(comp_type, def->num_components);
}

/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
 */
static nir_ssa_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   assert(intrin->src[1].is_ssa);
   nir_ssa_def *val = intrin->src[0].ssa;
   nir_ssa_def *id = intrin->src[1].ssa;

   /* The loop is something like:
    *
    * while (true) {
    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
    *    first_val = readFirstInvocation(val);
    *    first_result = readInvocation(val, readFirstInvocation(id));
    *    if (id == first_id)
    *       result = first_val;
    *    if (elect()) {
    *       if (id > gl_SubgroupInvocationID) {
    *          result = first_result;
    *       }
    *       break;
    *    }
    * }
    *
    * The idea is to guarantee, on each iteration of the loop, that anything
    * reading from first_id gets the correct value, so that we can then kill
    * it off by breaking out of the loop. Before doing that we also have to
    * ensure that the first_id invocation gets the correct value. The only
    * case where it won't already have been assigned the correct value is
    * when the invocation it's reading from hasn't been killed off yet, that
    * is, when it's later than its own ID.
    * Invocations where id <= gl_SubgroupInvocationID will be assigned their
    * result in the first if, and invocations where id >
    * gl_SubgroupInvocationID will be assigned their result in the second if.
    *
    * We do this more complicated loop rather than looping over all id's
    * explicitly because at this point we don't know the "actual" subgroup
    * size and at the moment there's no way to get at it, which means we may
    * loop over always-inactive invocations.
    */

   nir_ssa_def *subgroup_id = nir_load_subgroup_invocation(b);

   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");

   nir_loop *loop = nir_push_loop(b); {
      nir_ssa_def *first_id = nir_read_first_invocation(b, subgroup_id);
      nir_ssa_def *first_val = nir_read_first_invocation(b, val);
      nir_ssa_def *first_result =
         nir_read_invocation(b, val, nir_read_first_invocation(b, id));

      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id)); {
         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
      } nir_pop_if(b, nif);

      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1)); {
         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id)); {
            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
         } nir_pop_if(b, nif3);

         nir_jump(b, nir_jump_break);
      } nir_pop_if(b, nif2);
   } nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}

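/* All lowering decisions are made per-intrinsic in lower_subgroups_instr(),
 * so the filter only needs to select intrinsic instructions.
 */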
static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   return instr->type == nir_instr_type_intrinsic;
}

/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 */
static nir_ssa_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_ssa_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* This only works if all the high bits are the same as bit 1. */
   assert((val >> 2) == (val & 0x2 ? -1 : 0));

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so in case there's
    * more than one component the component which 1 would be shifted into
    * already has the right value and all we have to do is fixup the other
    * components. Components below it should always be 0, and components above
    * it must be either 0 or ~0 because of the assert above. For example, if
    * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
    * compute a single-component result of 2, which is correct for the second
    * component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component count.
    */
   nir_const_value min_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i].i32 = (i + 1) * options->ballot_bit_size;
   nir_ssa_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}

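/* The next three helpers build the gl_Subgroup{Eq,Ge,Gt}Mask values: the eq
 * mask has only the current invocation's bit set, the ge mask has that bit
 * and every higher bit set, and the gt mask has only the higher bits set.
 */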
static nir_ssa_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}

/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
static nir_ssa_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
                  nir_isub_imm(b, options->ballot_bit_size,
                               subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 for every component whose index is less than the subgroup
    * size divided by the ballot bitsize, and 0 otherwise. For example, with
    * a target ballot type of 4 x uint32 and subgroup_size = 64 we'd need to
    * return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases.  The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_ssa_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}

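/* Adds up the per-component bit counts of a multi-component ballot value to
 * get the total number of set bits.
 */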
static nir_ssa_def *
vec_bit_count(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_bit_count(b, value);
   nir_ssa_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}

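/* Finds the index of the lowest set bit across all components of a ballot
 * value, or -1 if no bit is set.  Iterating from the last component down
 * lets the lowest component containing a set bit take priority.
 */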
static nir_ssa_def *
vec_find_lsb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_find_lsb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

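/* Finds the index of the highest set bit across all components of a ballot
 * value, or -1 if no bit is set.  Iterating from the first component up
 * lets the highest component containing a set bit take priority.
 */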
static nir_ssa_def *
vec_find_msb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_ufind_msb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

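/* Lowers a quad_broadcast with a non-constant index.  By default it becomes
 * a shuffle; with lower_quad_broadcast_dynamic_to_const set it becomes four
 * constant quad_broadcasts whose results are selected between with bcsel.
 */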
static nir_ssa_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_to_shuffle(b, intrin, options);

   nir_ssa_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_intrinsic_instr *qbcst =
         nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast);

      qbcst->num_components = intrin->num_components;
      qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i));
      nir_src_copy(&qbcst->src[0], &intrin->src[0]);
      nir_ssa_dest_init(&qbcst->instr, &qbcst->dest,
                        intrin->dest.ssa.num_components,
                        intrin->dest.ssa.bit_size, NULL);

      nir_ssa_def *qbcst_dst = NULL;

      if (options->lower_to_scalar && qbcst->num_components > 1) {
         qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false);
      } else {
         nir_builder_instr_insert(b, &qbcst->instr);
         qbcst_dst = &qbcst->dest.ssa;
      }

      if (i)
         dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa,
                                    nir_imm_int(b, i)),
                         qbcst_dst, dst);
      else
         dst = qbcst_dst;
   }

   return dst;
}

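/* ir3-specific lowering: implements read_invocation as a conditional read
 * where only the invocation whose subgroup ID matches the requested index
 * supplies the value.
 */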
static nir_ssa_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->dest.ssa.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}

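/* Per-instruction lowering callback.  Returns a replacement SSA value,
 * NIR_LOWER_INSTR_PROGRESS if the instruction was modified in place, or
 * NULL if it was left untouched.
 */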
static nir_ssa_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return nir_ssa_for_src(b, intrin->src[0], 1);
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (options->lower_vote_eq)
         return lower_vote_eq(b, intrin);

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_ssa_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                           build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                           build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }

   case nir_intrinsic_ballot: {
      if (intrin->dest.ssa.num_components == options->ballot_components &&
          intrin->dest.ssa.bit_size == options->ballot_bit_size)
         return NULL;

      nir_ssa_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }

   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          *    "Find the most significant bit set to 1 in Value, considering
          *    only the bits in Value required to represent all bits of the
          *    group’s invocations.  If none of the considered bits is set to
          *    1, the result is undefined."
          *
          * It has similar text for the other three.  This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits.  If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits.  In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * anything we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         assert(intrin->src[1].is_ssa);
         nir_ssa_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated in
             * the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_test_mask(b, nir_ushr(b, int_val, idx), 1);
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }
   }

   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
      nir_ssa_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }

   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }

   case nir_intrinsic_shuffle:
      if (options->lower_shuffle)
         return lower_shuffle(b, intrin);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_relative_shuffle)
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_reduce: {
      nir_ssa_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation-defined */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (options->lower_to_scalar && intrin->num_components > 1)
         ret = lower_subgroup_op_to_scalar(b, intrin, false);
      return ret;
   }
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   default:
      break;
   }

   return NULL;
}

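/* Runs the subgroup lowering over every intrinsic instruction in the
 * shader.  Returns true if any instruction was changed.
 */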
bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   return nir_shader_lower_instructions(shader,
                                        lower_subgroups_filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}