/*
 * Copyright 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include <stdint.h>
#include "util/half_float.h"

#include "brw_fs.h"
#include "brw_builder.h"

using namespace brw;

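/* Description of how a subgroup reduction or scan combines values: MIN/MAX
 * are expressed as a SEL with a conditional modifier, every other operation
 * as a plain binary opcode.  The identity value is what inactive channels
 * must contribute so they don't affect the result.
 */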
struct brw_reduction_info {
   brw_reg             identity;
   enum opcode         op;
   brw_conditional_mod cond_mod;
};

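/* Pick the opcode (or SEL plus conditional modifier) and the identity value
 * used to implement red_op on values of the given type.
 */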
static brw_reduction_info
brw_get_reduction_info(brw_reduce_op red_op, brw_reg_type type)
{
   struct brw_reduction_info info;

   info.op = BRW_OPCODE_SEL;
   info.cond_mod = BRW_CONDITIONAL_NONE;

   switch (red_op) {
   case BRW_REDUCE_OP_ADD: info.op = BRW_OPCODE_ADD;           break;
   case BRW_REDUCE_OP_MUL: info.op = BRW_OPCODE_MUL;           break;
   case BRW_REDUCE_OP_AND: info.op = BRW_OPCODE_AND;           break;
   case BRW_REDUCE_OP_OR:  info.op = BRW_OPCODE_OR;            break;
   case BRW_REDUCE_OP_XOR: info.op = BRW_OPCODE_XOR;           break;
   case BRW_REDUCE_OP_MIN: info.cond_mod = BRW_CONDITIONAL_L;  break;
   case BRW_REDUCE_OP_MAX: info.cond_mod = BRW_CONDITIONAL_GE; break;
   default:
      unreachable("invalid reduce op");
   }

   switch (red_op) {
   case BRW_REDUCE_OP_ADD:
   case BRW_REDUCE_OP_XOR:
   case BRW_REDUCE_OP_OR:
      info.identity = retype(brw_imm_u64(0), type);
      return info;
   case BRW_REDUCE_OP_AND:
      info.identity = retype(brw_imm_u64(~0ull), type);
      return info;
   default:
      /* Continue below. */
      break;
   }

   brw_reg id;
   const unsigned size = brw_type_size_bytes(type);

   switch (red_op) {
   case BRW_REDUCE_OP_MUL: {
      if (brw_type_is_int(type)) {
         id = size < 4  ? brw_imm_uw(1) :
              size == 4 ? brw_imm_ud(1) :
                          brw_imm_u64(1);
      } else {
         assert(brw_type_is_float(type));
         id = size == 2 ? brw_imm_uw(_mesa_float_to_half(1.0)) :
              size == 4 ? brw_imm_f(1.0) :
                          brw_imm_df(1.0);
      }
      break;
   }

   case BRW_REDUCE_OP_MIN: {
      if (brw_type_is_uint(type)) {
         id = brw_imm_u64(~0ull);
      } else if (brw_type_is_sint(type)) {
         id = size == 1 ? brw_imm_w(INT8_MAX) :
              size == 2 ? brw_imm_w(INT16_MAX) :
              size == 4 ? brw_imm_d(INT32_MAX) :
                          brw_imm_q(INT64_MAX);
      } else {
         assert(brw_type_is_float(type));
         id = size == 2 ? brw_imm_uw(_mesa_float_to_half(INFINITY)) :
              size == 4 ? brw_imm_f(INFINITY) :
                          brw_imm_df(INFINITY);
      }
      break;
   }

   case BRW_REDUCE_OP_MAX: {
      if (brw_type_is_uint(type)) {
         id = brw_imm_u64(0);
      } else if (brw_type_is_sint(type)) {
         id = size == 1 ? brw_imm_w(INT8_MIN) :
              size == 2 ? brw_imm_w(INT16_MIN) :
              size == 4 ? brw_imm_d(INT32_MIN) :
                          brw_imm_q(INT64_MIN);
      } else {
         assert(brw_type_is_float(type));
         id = size == 2 ? brw_imm_uw(_mesa_float_to_half(-INFINITY)) :
              size == 4 ? brw_imm_f(-INFINITY) :
                          brw_imm_df(-INFINITY);
      }
      break;
   }

   default:
      unreachable("invalid reduce op");
   }

   /* For some of the cases above (e.g. all bits zero, all bits one, only the
    * lowest bit set) either the size or the signedness was ignored, so
    * adjust the final type now.
    *
    * B/UB types can't have immediates, so we used W/UW above and do so here
    * as well.
    */
   if      (type == BRW_TYPE_UB) type = BRW_TYPE_UW;
   else if (type == BRW_TYPE_B)  type = BRW_TYPE_W;

   info.identity = retype(id, type);

   return info;
}

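/* Emit one combining step of a scan: apply the operation to the strided
 * "left" and "right" views of tmp, writing the result back into the right
 * elements in place.  64-bit integer MIN/MAX are expanded into a
 * compare-and-predicated-MOV sequence on platforms where the native Q/UQ
 * operation can't be used.
 */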
static void
brw_emit_scan_step(const brw_builder &bld, enum opcode opcode, brw_conditional_mod mod,
                   const brw_reg &tmp,
                   unsigned left_offset, unsigned left_stride,
                   unsigned right_offset, unsigned right_stride)
{
   brw_reg left, right;
   left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
   right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
   if ((tmp.type == BRW_TYPE_Q || tmp.type == BRW_TYPE_UQ) &&
       (!bld.shader->devinfo->has_64bit_int || bld.shader->devinfo->ver >= 20)) {
      switch (opcode) {
      case BRW_OPCODE_MUL:
         /* This will get lowered by integer MUL lowering */
         set_condmod(mod, bld.emit(opcode, right, left, right));
         break;

      case BRW_OPCODE_SEL: {
         /* In order for the comparisons to work out right, we need our
          * comparisons to be strict.
          */
         assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
         if (mod == BRW_CONDITIONAL_GE)
            mod = BRW_CONDITIONAL_G;

         /* We treat the bottom 32 bits as unsigned regardless of
          * whether or not the integer as a whole is signed.
          */
         brw_reg right_low = subscript(right, BRW_TYPE_UD, 0);
         brw_reg left_low = subscript(left, BRW_TYPE_UD, 0);

         /* The upper bits get the same sign as the 64-bit type */
         brw_reg_type type32 = brw_type_with_size(tmp.type, 32);
         brw_reg right_high = subscript(right, type32, 1);
         brw_reg left_high = subscript(left, type32, 1);

         /* Build up our comparison:
          *
          *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
          */
         bld.CMP(bld.null_reg_ud(), retype(left_low, BRW_TYPE_UD),
                 retype(right_low, BRW_TYPE_UD), mod);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.CMP(bld.null_reg_ud(), left_high, right_high,
                               BRW_CONDITIONAL_EQ));
         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                           bld.CMP(bld.null_reg_ud(), left_high, right_high, mod));

         /* We could use selects here or we could use predicated MOVs
          * because the destination and second source (if it were a SEL)
          * are the same.
          */
         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_low, left_low));
         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(right_high, left_high));
         break;
      }

      default:
         unreachable("Unsupported 64-bit scan op");
      }
   } else {
      set_condmod(mod, bld.emit(opcode, right, left, right));
   }
}

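/* Perform an in-place inclusive scan over tmp with the given opcode and
 * conditional modifier.  cluster_size limits how far values propagate, so a
 * reduction can reuse this with the per-cluster result landing in the last
 * channel of each cluster.
 */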
static void
brw_emit_scan(const brw_builder &bld, enum opcode opcode, const brw_reg &tmp,
              unsigned cluster_size, brw_conditional_mod mod)
{
   unsigned dispatch_width = bld.dispatch_width();
   assert(dispatch_width >= 8);

   /* The instruction splitting code isn't advanced enough to split these, so
    * we need to handle that ourselves.
    */
   if (dispatch_width * brw_type_size_bytes(tmp.type) > 2 * REG_SIZE) {
      const unsigned half_width = dispatch_width / 2;
      const brw_builder ubld = bld.exec_all().group(half_width, 0);
      brw_reg left = tmp;
      brw_reg right = horiz_offset(tmp, half_width);
      brw_emit_scan(ubld, opcode, left, cluster_size, mod);
      brw_emit_scan(ubld, opcode, right, cluster_size, mod);
      if (cluster_size > half_width) {
         brw_emit_scan_step(ubld, opcode, mod, tmp,
                            half_width - 1, 0, half_width, 1);
      }
      return;
   }

   if (cluster_size > 1) {
      const brw_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
      brw_emit_scan_step(ubld, opcode, mod, tmp, 0, 2, 1, 2);
   }

   if (cluster_size > 2) {
      if (brw_type_size_bytes(tmp.type) <= 4) {
         const brw_builder ubld =
            bld.exec_all().group(dispatch_width / 4, 0);
         brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 2, 4);
         brw_emit_scan_step(ubld, opcode, mod, tmp, 1, 4, 3, 4);
      } else {
         /* For 64-bit types, we have to do things differently because
          * the code above would land us with destination strides that
          * the hardware can't handle.  Fortunately, we'll only be
          * 8-wide in that case and it's the same number of
          * instructions.
          */
         const brw_builder ubld = bld.exec_all().group(2, 0);
         for (unsigned i = 0; i < dispatch_width; i += 4)
            brw_emit_scan_step(ubld, opcode, mod, tmp, i + 1, 0, i + 2, 1);
      }
   }

   for (unsigned i = 4;
        i < MIN2(cluster_size, dispatch_width);
        i *= 2) {
      const brw_builder ubld = bld.exec_all().group(i, 0);
      brw_emit_scan_step(ubld, opcode, mod, tmp, i - 1, 0, i, 1);

      if (dispatch_width > i * 2)
         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

      if (dispatch_width > i * 4) {
         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
         brw_emit_scan_step(ubld, opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
      }
   }
}

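/* Lower SHADER_OPCODE_REDUCE: select the source into a scratch register
 * (with the identity value in inactive channels), scan it, then broadcast
 * the last channel of each cluster to every channel of that cluster in the
 * destination.
 */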
static bool
brw_lower_reduce(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   assert(inst->dst.type == inst->src[0].type);
   brw_reg dst = inst->dst;
   brw_reg src = inst->src[0];

   assert(inst->src[1].file == IMM);
   enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud;

   assert(inst->src[2].file == IMM);
   unsigned cluster_size = inst->src[2].ud;

   assert(cluster_size > 0);
   assert(cluster_size <= s.dispatch_width);

   struct brw_reduction_info info = brw_get_reduction_info(op, src.type);

   /* Set up a register for all of our scratching around and initialize it
    * to the reduction operation's identity value.
    */
   brw_reg scan = bld.vgrf(src.type);
   bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity);

   brw_emit_scan(bld, info.op, scan, cluster_size, info.cond_mod);

   if (cluster_size * brw_type_size_bytes(src.type) >= REG_SIZE * 2) {
      /* In this case, the CLUSTER_BROADCAST instruction isn't needed because
       * the distance between clusters is at least 2 GRFs, so we can skip its
       * unusual striding and just do regular MOVs.
       */
      assert((cluster_size * brw_type_size_bytes(src.type)) % (REG_SIZE * 2) == 0);
      const unsigned groups =
         (s.dispatch_width * brw_type_size_bytes(src.type)) / (REG_SIZE * 2);
      const unsigned group_size = s.dispatch_width / groups;
      for (unsigned i = 0; i < groups; i++) {
         const unsigned cluster = (i * group_size) / cluster_size;
         const unsigned comp = cluster * cluster_size + (cluster_size - 1);
         bld.group(group_size, i).MOV(horiz_offset(dst, i * group_size),
                                      component(scan, comp));
      }
   } else {
      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dst, scan,
               brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
   }
   inst->remove(block);
   return true;
}

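/* Lower SHADER_OPCODE_INCLUSIVE_SCAN and SHADER_OPCODE_EXCLUSIVE_SCAN.  The
 * exclusive variant first shifts every channel's value up by one (using a
 * SHUFFLE with per-channel indices) and places the identity value in
 * channel 0, then runs the same inclusive scan.
 */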
static bool
brw_lower_scan(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   assert(inst->dst.type == inst->src[0].type);
   brw_reg dst = inst->dst;
   brw_reg src = inst->src[0];

   assert(inst->src[1].file == IMM);
   enum brw_reduce_op op = (enum brw_reduce_op)inst->src[1].ud;

   struct brw_reduction_info info = brw_get_reduction_info(op, src.type);

   /* Set up a register for all of our scratching around and initialize it
    * to the reduction operation's identity value.
    */
   brw_reg scan = bld.vgrf(src.type);
   const brw_builder ubld = bld.exec_all();
   ubld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, info.identity);

   if (inst->opcode == SHADER_OPCODE_EXCLUSIVE_SCAN) {
      /* Exclusive scan is a bit harder because we have to do an annoying
       * shift of the contents before we can begin.  To make things worse,
       * we can't do this with a normal stride; we have to use indirects.
       */
      brw_reg shifted = bld.vgrf(src.type);
      brw_reg idx = bld.vgrf(BRW_TYPE_UW);

      /* Set the saturate modifier on the offset index so it stays within the
       * expected range and never goes negative; otherwise we could read past
       * the end of the register file, leading to hangs on Xe3.
       */
      set_saturate(true, ubld.ADD(idx, bld.LOAD_SUBGROUP_INVOCATION(),
                                  brw_imm_w(-1)));
      ubld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
      ubld.group(1, 0).MOV(horiz_offset(shifted, 0), info.identity);
      scan = shifted;
   }

   brw_emit_scan(bld, info.op, scan, s.dispatch_width, info.cond_mod);

   bld.MOV(dst, scan);

   inst->remove(block);
   return true;
}

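/* Fill the f0 flag register with the given value: both f0.0 and f0.1 for
 * SIMD32, only f0.0 otherwise.  Returns the flag register so callers can
 * read it back.
 */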
static brw_reg
brw_fill_flag(const brw_builder &bld, unsigned v)
{
   const brw_builder ubld1 = bld.exec_all().group(1, 0);
   brw_reg flag = brw_flag_reg(0, 0);

   if (bld.shader->dispatch_width == 32) {
      /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
      flag = retype(flag, BRW_TYPE_UD);
      ubld1.MOV(flag, brw_imm_ud(v));
   } else {
      ubld1.MOV(flag, brw_imm_uw(v & 0xFFFF));
   }

   return flag;
}

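/* Lower VOTE_ANY/ALL/EQUAL when the cluster covers the whole dispatch:
 * prime the flag with the identity for the predicate, CMP the source (for
 * VOTE_EQUAL, against a uniformized copy of itself), then read the result
 * back with an ANY/ALL predicate on a scalar MOV.
 */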
static void
brw_lower_dispatch_width_vote(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const unsigned dispatch_width = bld.shader->dispatch_width;

   assert(opcode == SHADER_OPCODE_VOTE_ANY ||
          opcode == SHADER_OPCODE_VOTE_ALL ||
          opcode == SHADER_OPCODE_VOTE_EQUAL);

   const bool any   = opcode == SHADER_OPCODE_VOTE_ANY;
   const bool equal = opcode == SHADER_OPCODE_VOTE_EQUAL;

   const brw_reg ref = equal ? bld.emit_uniformize(src) : brw_imm_d(0);

   /* The any/all predicates do not consider channel enables.  To prevent
    * dead channels from affecting the result, we initialize the flag with
    * the identity value for the logical operation.
    */
   brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF);
   bld.CMP(bld.null_reg_d(), src, ref, equal ? BRW_CONDITIONAL_Z
                                             : BRW_CONDITIONAL_NZ);

   /* For some reason, the any/all predicates don't work properly with
    * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
    * doesn't read the correct subset of the flag register and you end up
    * getting garbage in the second half.  Work around this by using a pair
    * of 1-wide MOVs and scattering the result.
    *
    * TODO: Check if we still need this for newer platforms.
    */
   const brw_builder ubld = devinfo->ver >= 20 ? bld.exec_all()
                                               : bld.exec_all().group(1, 0);
   brw_reg res1 = ubld.MOV(brw_imm_d(0));

   enum brw_predicate pred;
   if (any) {
      pred = devinfo->ver >= 20   ? XE2_PREDICATE_ANY :
             dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
             dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                    BRW_PREDICATE_ALIGN1_ANY32H;
   } else {
      pred = devinfo->ver >= 20   ? XE2_PREDICATE_ALL :
             dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
             dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                    BRW_PREDICATE_ALIGN1_ALL32H;
   }
   set_predicate(pred, ubld.MOV(res1, brw_imm_d(-1)));

   bld.MOV(retype(dst, BRW_TYPE_D), component(res1, 0));
}

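/* Lower a quad (cluster_size == 4) vote on platforms before Xe2, which
 * provide the specialized ANY4H/ALL4H predicates.
 */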
static void
brw_lower_quad_vote_gfx9(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src)
{
   assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL);
   const bool any = opcode == SHADER_OPCODE_VOTE_ANY;

   /* The any/all predicates do not consider channel enables.  To prevent
    * dead channels from affecting the result, we initialize the flag with
    * the identity value for the logical operation.
    */
   brw_fill_flag(bld, any ? 0 : 0xFFFFFFFF);
   bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
   bld.exec_all().MOV(retype(dst, BRW_TYPE_UD), brw_imm_ud(0));

   /* Before Xe2, we can use specialized predicates. */
   const enum brw_predicate pred = any ? BRW_PREDICATE_ALIGN1_ANY4H
                                       : BRW_PREDICATE_ALIGN1_ALL4H;

   fs_inst *mov = bld.MOV(retype(dst, BRW_TYPE_D), brw_imm_d(-1));
   set_predicate(pred, mov);
}

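/* Lower a quad vote on Xe2+, which has no 4-wide predicates: build a
 * per-channel mask of the invocations in the same quad that pass the
 * condition and compare it against zero (any) or against the mask of active
 * invocations in the quad (all).
 */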
static void
brw_lower_quad_vote_gfx20(const brw_builder &bld, enum opcode opcode, brw_reg dst, brw_reg src)
{
   assert(opcode == SHADER_OPCODE_VOTE_ANY || opcode == SHADER_OPCODE_VOTE_ALL);
   const bool any = opcode == SHADER_OPCODE_VOTE_ANY;

   /* This code is going to manipulate the flag mask, so clear it first to
    * avoid any residual value from disabled channels.
    */
   brw_reg flag = brw_fill_flag(bld, 0);

   /* Mask of invocations where the condition is true; note that the mask is
    * replicated to each invocation.
    */
   bld.CMP(bld.null_reg_ud(), src, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
   brw_reg cond_mask = bld.vgrf(BRW_TYPE_UD);
   bld.MOV(cond_mask, flag);

   /* Mask of the invocations in the quad; each invocation gets all the bits
    * set for its quad, i.e. invocations 0-3 will have 0b...1111, invocations
    * 4-7 will have 0b...11110000 and so on.
    */
   brw_reg invoc_ud = bld.vgrf(BRW_TYPE_UD);
   bld.MOV(invoc_ud, bld.LOAD_SUBGROUP_INVOCATION());
   brw_reg quad_mask =
      bld.SHL(brw_imm_ud(0xF), bld.AND(invoc_ud, brw_imm_ud(0xFFFFFFFC)));

   /* Each invocation ends up with a bit set for every invocation in its quad
    * that passes the condition.  This is uniform within each quad.
    */
   brw_reg tmp = bld.AND(cond_mask, quad_mask);

   if (any) {
      bld.CMP(retype(dst, BRW_TYPE_UD), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
   } else {
      /* Filter quad_mask down to only the active channels. */
      brw_reg active = bld.vgrf(BRW_TYPE_UD);
      bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, active);
      bld.MOV(active, brw_reg(component(active, 0)));
      bld.AND(quad_mask, quad_mask, active);

      bld.CMP(retype(dst, BRW_TYPE_UD), tmp, quad_mask, BRW_CONDITIONAL_Z);
   }
}

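/* Lower the VOTE_* opcodes, dispatching on cluster size: a cluster covering
 * the whole dispatch uses the subgroup-wide path, otherwise the quad
 * (cluster_size == 4) path for the current hardware generation.
 */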
static bool
brw_lower_vote(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   brw_reg dst = inst->dst;
   brw_reg src = inst->src[0];

   unsigned cluster_size;
   if (inst->sources > 1) {
      assert(inst->src[1].file == IMM);
      cluster_size = inst->src[1].ud;
   } else {
      cluster_size = s.dispatch_width;
   }

   if (cluster_size == s.dispatch_width) {
      brw_lower_dispatch_width_vote(bld, inst->opcode, dst, src);
   } else {
      assert(cluster_size == 4);
      if (s.devinfo->ver < 20)
         brw_lower_quad_vote_gfx9(bld, inst->opcode, dst, src);
      else
         brw_lower_quad_vote_gfx20(bld, inst->opcode, dst, src);
   }

   inst->remove(block);
   return true;
}

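/* Lower SHADER_OPCODE_BALLOT.  ballot(true) is just the live-channel mask;
 * otherwise clear the flag, compare the value against zero, and read the
 * resulting flag register.
 */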
static bool
brw_lower_ballot(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   brw_reg value = retype(inst->src[0], BRW_TYPE_UD);
   brw_reg dst = inst->dst;

   const brw_builder xbld = dst.is_scalar ? bld.scalar_group() : bld;

   if (value.file == IMM) {
      /* Implement a fast-path for ballot(true). */
      if (!value.is_zero()) {
         brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
         bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, tmp);
         xbld.MOV(dst, brw_reg(component(tmp, 0)));
      } else {
         brw_reg zero = retype(brw_imm_uq(0), dst.type);
         xbld.MOV(dst, zero);
      }
   } else {
      brw_reg flag = brw_fill_flag(bld, 0);
      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
      xbld.MOV(dst, flag);
   }

   inst->remove(block);
   return true;
}

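/* Lower SHADER_OPCODE_QUAD_SWAP.  Horizontal swaps exchange adjacent
 * channels with strided MOVs; vertical and diagonal swaps use a QUAD_SWIZZLE
 * for 32-bit types and a SHUFFLE with XORed invocation indices for wider
 * types.
 */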
static bool
brw_lower_quad_swap(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   assert(inst->dst.type == inst->src[0].type);
   brw_reg dst = inst->dst;
   brw_reg value = inst->src[0];

   assert(inst->src[1].file == IMM);
   enum brw_swap_direction dir = (enum brw_swap_direction)inst->src[1].ud;

   switch (dir) {
   case BRW_SWAP_HORIZONTAL: {
      const brw_reg tmp = bld.vgrf(value.type);

      const brw_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);

      const brw_reg src_left = horiz_stride(value, 2);
      const brw_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
      const brw_reg tmp_left = horiz_stride(tmp, 2);
      const brw_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);

      ubld.MOV(tmp_left, src_right);
      ubld.MOV(tmp_right, src_left);

      bld.MOV(retype(dst, value.type), tmp);
      break;
   }
   case BRW_SWAP_VERTICAL:
   case BRW_SWAP_DIAGONAL: {
      if (brw_type_size_bits(value.type) == 32) {
         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
         const unsigned swizzle = dir == BRW_SWAP_VERTICAL ? BRW_SWIZZLE4(2,3,0,1)
                                                           : BRW_SWIZZLE4(3,2,1,0);
         const brw_reg tmp = bld.vgrf(value.type);
         const brw_builder ubld = bld.exec_all();
         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, brw_imm_ud(swizzle));
         bld.MOV(dst, tmp);
      } else {
         /* For larger data types, we have to either emit dispatch_width many
          * MOVs or else fall back to doing indirects.
          */
         const unsigned xor_mask = dir == BRW_SWAP_VERTICAL ? 0x2 : 0x3;
         brw_reg idx = bld.vgrf(BRW_TYPE_W);
         bld.XOR(idx, bld.LOAD_SUBGROUP_INVOCATION(), brw_imm_w(xor_mask));
         bld.emit(SHADER_OPCODE_SHUFFLE, dst, value, idx);
      }
      break;
   }
   }

   inst->remove(block);
   return true;
}

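/* Lower READ_FROM_LIVE_CHANNEL: reading from the first live channel is just
 * emit_uniformize() of the source.
 */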
static bool
brw_lower_read_from_live_channel(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   assert(inst->sources == 1);
   assert(inst->dst.type == inst->src[0].type);
   brw_reg dst = inst->dst;
   brw_reg value = inst->src[0];

   bld.MOV(dst, bld.emit_uniformize(value));

   inst->remove(block);
   return true;
}

static bool
brw_lower_read_from_channel(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const brw_builder bld(&s, block, inst);

   assert(inst->sources == 2);
   assert(inst->dst.type == inst->src[0].type);

   brw_reg dst = inst->dst;
   brw_reg value = inst->src[0];
   brw_reg index = retype(inst->src[1], BRW_TYPE_UD);

   /* When the subgroup_size picked by NIR is larger than the dispatch size
    * picked by the backend (this can happen in RT and FS), bound the
    * invocation index to the dispatch size.
    */
   const unsigned dispatch_width_mask = s.dispatch_width - 1;

   if (index.file == IMM) {
      /* Always apply mask here since it is cheap. */
      bld.MOV(dst, component(value, index.ud & dispatch_width_mask));
   } else {
      if (s.api_subgroup_size == 0 || s.dispatch_width < s.api_subgroup_size)
         index = bld.AND(index, brw_imm_ud(dispatch_width_mask));

      brw_reg tmp = bld.BROADCAST(value, bld.emit_uniformize(index));
      bld.MOV(dst, tmp);
   }

   inst->remove(block);
   return true;
}

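/* Pass entry point: expand the virtual subgroup opcodes (reduce, scan, vote,
 * ballot, quad swap, read-from-channel) into sequences the rest of the
 * backend can handle.
 */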
bool
brw_lower_subgroup_ops(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case SHADER_OPCODE_REDUCE:
         progress |= brw_lower_reduce(s, block, inst);
         break;

      case SHADER_OPCODE_INCLUSIVE_SCAN:
      case SHADER_OPCODE_EXCLUSIVE_SCAN:
         progress |= brw_lower_scan(s, block, inst);
         break;

      case SHADER_OPCODE_VOTE_ANY:
      case SHADER_OPCODE_VOTE_ALL:
      case SHADER_OPCODE_VOTE_EQUAL:
         progress |= brw_lower_vote(s, block, inst);
         break;

      case SHADER_OPCODE_BALLOT:
         progress |= brw_lower_ballot(s, block, inst);
         break;

      case SHADER_OPCODE_QUAD_SWAP:
         progress |= brw_lower_quad_swap(s, block, inst);
         break;

      case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
         progress |= brw_lower_read_from_live_channel(s, block, inst);
         break;

      case SHADER_OPCODE_READ_FROM_CHANNEL:
         progress |= brw_lower_read_from_channel(s, block, inst);
         break;

      default:
         /* Nothing to do. */
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}