/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_builder.h"

using namespace brw;

/**
 * Align16 3-source instructions cannot have scalar stride w/64-bit types.
 *
 * The Bspec says:
 *
 *    Replicate Control. This field is only present in three-source
 *    instructions, for each of the three source operands. It controls
 *    replication of the starting channel to all channels in the execution
 *    size. ChanSel does not apply when Replicate Control is set. This is
 *    applicable to 32b datatypes and 16b datatype. 64b datatypes cannot use
 *    the replicate control.
 *
 * In practice, this can only happen on Gfx9 with DF sources to MAD.  Since
 * the source is_scalar, this can be fixed by just making the stride=1. Also
 * clear is_scalar "just in case."
 */
bool
brw_lower_scalar_fp64_MAD(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   if (devinfo->ver != 9)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_MAD &&
          inst->dst.type == BRW_TYPE_DF) {
         for (unsigned i = 0; i < 3; i++) {
            if (inst->src[i].is_scalar) {
               inst->src[i].is_scalar = false;
               inst->src[i].stride = 1;
               progress = true;
            }
         }
      }
   }

   return progress;
}

bool
brw_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      brw_reg dst = inst->dst;

      const brw_builder ibld(&s, block, inst);
      const brw_builder ubld = ibld.exec_all();

      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                     retype(inst->src[i], BRW_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

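      /* Copy the remaining non-header sources at the instruction's original
       * execution size and type, advancing the destination by one full
       * execution-width component per source.
       */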
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Lower CSEL with unsupported types to CMP+SEL.
 *
 * Or, for unsigned ==/!= comparisons, simply change the types.
 */
bool
brw_lower_csel(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != BRW_OPCODE_CSEL)
         continue;

      bool supported = false;
      enum brw_reg_type orig_type = inst->src[2].type;
      enum brw_reg_type new_type = orig_type;

      switch (orig_type) {
      case BRW_TYPE_F:
         /* Gfx9 CSEL can only do F */
         supported = true;
         break;
      case BRW_TYPE_HF:
      case BRW_TYPE_W:
      case BRW_TYPE_D:
         /* Gfx11+ CSEL can do HF, W, and D.  Note that we can't simply
          * retype integer ==/!= comparisons as float on earlier hardware
          * because it breaks for 0x80000000 and 0 (-0.0 == 0.0).
          */
         supported = devinfo->ver >= 11;
         break;
      case BRW_TYPE_UW:
      case BRW_TYPE_UD:
         /* CSEL doesn't support UW/UD but we can simply retype to use the
          * signed types when comparing with == or !=.
          */
         supported = devinfo->ver >= 11 &&
                     (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                      inst->conditional_mod == BRW_CONDITIONAL_NEQ);

         /* Per Bspec 47408, Gfx125+ CSEL supports both signed and unsigned
          * integer types.
          */
         if (devinfo->verx10 < 125) {
            new_type = inst->src[2].type == BRW_TYPE_UD ?
                       BRW_TYPE_D : BRW_TYPE_W;
         }
         break;
      default:
         break;
      }

      if (!supported) {
         const brw_builder ibld(&s, block, inst);

         /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
         brw_reg zero = brw_imm_reg(orig_type);
         ibld.CMP(retype(brw_null_reg(), orig_type),
                  inst->src[2], zero, inst->conditional_mod);

         inst->opcode = BRW_OPCODE_SEL;
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
         inst->resize_sources(2);
         progress = true;
      } else if (new_type != orig_type) {
         inst->src[0].type = new_type;
         inst->src[1].type = new_type;
         inst->src[2].type = new_type;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const brw_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is that the hardware performs source
          * negation at the bit width of the source.  If the D-type source is
          * 0x80000000, its negation is again 0x80000000.  As a result,
          * subtractSaturate(0, 0x80000000) will produce 0x80000000 instead
          * of 0x7fffffff.  There are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x180000000, whose negation is 0x080000000.  This doesn't help
          *    for 64-bit integers (which are already bigger than 33 bits).
          *    There are also only 8 accumulators, so SIMD16 or SIMD32
          *    instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1).  We can use this fact to only do
          *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
          */
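         /* Case #1 above: negate the source through the 33-bit accumulator.
          * This only applies to SIMD8 instructions with sources narrower
          * than 64 bits.
          */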
         if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
             inst->src[0].type != BRW_TYPE_UQ) {
            brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                 inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_inst *add;

            brw_reg tmp = ibld.vgrf(inst->src[0].type);
            ibld.SHR(tmp, inst->src[1], brw_imm_d(1));

            brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
            brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
            add->saturate = true;

            add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const brw_builder ibld(&s, block, inst);
      const brw_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case BRW_OPCODE_PLN: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
         brw_reg srcs[4];

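         /* Gather X[0-7], Y[0-7], X[8-15], Y[8-15] from the standard layout
          * so the LOAD_PAYLOAD below produces the interleaved form described
          * in the comment above.
          */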
         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[1] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const brw_builder ubld = brw_builder(&s, block, inst).exec_all();
   const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
brw_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */

      const brw_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const brw_builder ubld = brw_builder(&s, block, inst).exec_all().group(1, 0);

      brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
      ubld.UNDEF(exec_mask);
      ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                                             retype(brw_mask_reg(0),
                                                    BRW_TYPE_UD));

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                                                retype(brw_sr0_reg(vmask ? 3 : 2),
                                                       BRW_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as the result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
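         /* LZD counts leading zeros starting from bit 31, so the index of
          * the most significant set bit (the last live channel) is
          * 31 - LZD(mask), computed below as an ADD with a negated source.
          */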
         brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
         const unsigned len = MIN2(inst->mlen, inst->ex_mlen);

         brw_reg tmp = brw_vgrf(s.alloc.allocate(len),
                                BRW_TYPE_UD);

         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const brw_builder ibld = brw_builder(&s, block, inst).exec_all().group(16, 0);
         brw_reg copy_src = retype(inst->src[arg], BRW_TYPE_UD);
         brw_reg copy_dst = tmp;
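         /* Each SIMD16 UD MOV copies two GRFs worth of payload; a trailing
          * odd register falls back to a SIMD8 MOV below.
          */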
         for (unsigned i = 0; i < len; i += 2) {
            if (len == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[arg] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
bool
brw_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = brw_vgrf(s.alloc.allocate(s.dispatch_width / 8),
                              inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}

static bool
unsupported_64bit_type(const intel_device_info *devinfo,
                       enum brw_reg_type type)
{
   return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
          (!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
                                       type == BRW_TYPE_Q));
}

/**
 * Perform lowering to legalize the IR for various ALU restrictions.
 *
 * For example:
 * - Splitting 64-bit MOV/SEL into 2x32-bit where needed
 */
bool
brw_lower_alu_restrictions(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

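            /* Split the 64-bit MOV into two 32-bit MOVs over the low and
             * high dwords of each channel.
             */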
            ibld.MOV(subscript(inst->dst, type, 1),
                     subscript(inst->src[0], type, 1));
            ibld.MOV(subscript(inst->dst, type, 0),
                     subscript(inst->src[0], type, 0));

            inst->remove(block);
            progress = true;
         }
         break;

      case BRW_OPCODE_SEL:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
            const brw_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 0),
                                   subscript(inst->src[0], type, 0),
                                   subscript(inst->src[1], type, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 1),
                                   subscript(inst->src[0], type, 1),
                                   subscript(inst->src[1], type, 1)));

            inst->remove(block);
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

static void
brw_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
                            brw_reg *reg, bool compressed)
{
   if (reg->file != VGRF)
      return;

   struct brw_reg new_reg;

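   /* A zero stride maps to the scalar <0;1,0> region.  Strides larger than
    * four exceed the maximum HorzStride, so they are encoded directly as a
    * <stride;1,0> region below.
    */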
   if (reg->stride == 0) {
      new_reg = brw_vec1_grf(reg->nr, 0);
   } else if (reg->stride > 4) {
      assert(reg != &inst->dst);
      assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
      new_reg = brw_vecn_grf(1, reg->nr, 0);
      new_reg = stride(new_reg, reg->stride, 1, 0);
   } else {
      /* From the Haswell PRM:
       *
       *  "VertStride must be used to cross GRF register boundaries. This
       *   rule implies that elements within a 'Width' cannot cross GRF
       *   boundaries."
       *
       * The maximum width value that could satisfy this restriction is:
       */
      const unsigned reg_width =
         REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));

      /* Because the hardware can only split source regions at a whole
       * multiple of width during decompression (i.e. vertically), clamp
       * the value obtained above to the physical execution size of a
       * single decompressed chunk of the instruction:
       */
      const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                  inst->exec_size;

      /* XXX - The equation above is strictly speaking not correct on
       *       hardware that supports unbalanced GRF writes -- On Gfx9+
       *       each decompressed chunk of the instruction may have a
       *       different execution size when the number of components
       *       written to each destination GRF is not the same.
       */

      const unsigned max_hw_width = 16;

      const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
      new_reg = brw_vecn_grf(width, reg->nr, 0);
      new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
   }

   new_reg = retype(new_reg, reg->type);
   new_reg = byte_offset(new_reg, reg->offset);
   new_reg.abs = reg->abs;
   new_reg.negate = reg->negate;
   new_reg.is_scalar = reg->is_scalar;

   *reg = new_reg;
}

void
brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
   assert(s.grf_used || !"Must be called after register allocation");

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */

      const bool compressed =
           inst->dst.component_size(inst->exec_size) > REG_SIZE;

      brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
      for (int i = 0; i < inst->sources; i++) {
         brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
      }
   }

   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                         DEPENDENCY_VARIABLES);
}

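/* Build a scalar region that reads or writes sub-register "subnr" of the
 * architectural scalar register s0, which SEND_GATHER uses below to hold
 * the gathered payload register numbers.
 */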
static brw_reg
brw_s0(enum brw_reg_type type, unsigned subnr)
{
   return brw_make_reg(ARF,
                       BRW_ARF_SCALAR,
                       subnr,
                       0,
                       0,
                       type,
                       BRW_VERTICAL_STRIDE_0,
                       BRW_WIDTH_1,
                       BRW_HORIZONTAL_STRIDE_0,
                       BRW_SWIZZLE_XYZW,
                       WRITEMASK_XYZW);
}

static bool
brw_lower_send_gather_inst(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const intel_device_info *devinfo = s.devinfo;
   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   assert(inst->opcode == SHADER_OPCODE_SEND_GATHER);
   assert(inst->sources > 2);
   assert(inst->src[2].file == BAD_FILE);

   unsigned count = 0;
   uint8_t regs[16] = {};

   const unsigned num_payload_sources = inst->sources - 3;
   assert(num_payload_sources > 0);

   /* Limited by Src0.Length in the SEND instruction. */
   assert(num_payload_sources < 16);

   for (unsigned i = 3; i < inst->sources; i++) {
      assert(inst->src[i].file == FIXED_GRF);
      assert(inst->src[i].nr % reg_unit(devinfo) == 0);

      unsigned nr = phys_nr(devinfo, inst->src[i]);
      assert(nr <= UINT8_MAX);
      regs[count++] = nr;
   }

   /* Fill out ARF scalar register with the physical register numbers
    * and use SEND_GATHER.
    */
   brw_builder ubld = brw_builder(&s, block, inst).group(1, 0).exec_all();
   for (unsigned q = 0; q < DIV_ROUND_UP(count, 8); q++) {
      uint64_t v = 0;
      for (unsigned i = 0; i < 8; i++) {
         const uint64_t reg = regs[(q * 8) + i];
         v |= reg << (8 * i);
      }
      ubld.MOV(brw_s0(BRW_TYPE_UQ, q), brw_imm_uq(v));
   }

   inst->src[2] = brw_s0(BRW_TYPE_UD, 0);
   inst->mlen = count * unit;

   return true;
}

bool
brw_lower_send_gather(fs_visitor &s)
{
   assert(s.devinfo->ver >= 30);
   assert(s.grf_used || !"Must be called after register allocation");

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND_GATHER)
         progress |= brw_lower_send_gather_inst(s, block, inst);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_lower_load_subgroup_invocation(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
         continue;

      const brw_builder abld =
         brw_builder(&s, block, inst).annotate("SubgroupInvocation");
      const brw_builder ubld8 = abld.group(8, 0).exec_all();
      ubld8.UNDEF(inst->dst);

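      /* brw_imm_v() below is a packed vector immediate holding eight 4-bit
       * values, so 0x76543210 writes 0, 1, ..., 7 across the first eight
       * channels; wider dispatches add 8 and 16 for the later groups.
       */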
      if (inst->exec_size == 8) {
         assert(inst->dst.type == BRW_TYPE_UD);
         brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
         ubld8.MOV(uw, brw_imm_v(0x76543210));
         ubld8.MOV(inst->dst, uw);
      } else {
         assert(inst->dst.type == BRW_TYPE_UW);
         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
         if (inst->exec_size > 16) {
            const brw_builder ubld16 = abld.group(16, 0).exec_all();
            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
         }
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_lower_indirect_mov(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 20)
      return progress;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
         if (brw_type_size_bytes(inst->src[0].type) > 1 &&
             brw_type_size_bytes(inst->dst.type) > 1) {
            continue;
         }

         assert(brw_type_size_bytes(inst->src[0].type) ==
                brw_type_size_bytes(inst->dst.type));

         const brw_builder ibld(&s, block, inst);

         /* Extract unaligned part */
         uint16_t extra_offset = inst->src[0].offset & 0x1;
         brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));

         /* Check if offset is odd or even so that we can choose either high or
          * low byte from the result.
          */
         brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));

         /* Make sure offset is word (2-byte) aligned. */
         offset = ibld.AND(offset, brw_imm_uw(~1));

         /* Indirect addressing (Vx1 and VxH) is not supported with UB/B data
          * types for Src0, so change the data type of src0 and dst to UW.
          */
         brw_reg dst = ibld.vgrf(BRW_TYPE_UW);

         /* Subtract the unaligned offset from the src0 offset since we
          * already accounted for the unaligned part in the indirect byte
          * offset.
          */
         brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
         start.offset &= ~extra_offset;

         /* Adjust the length to account for the extra offset. */
         assert(inst->src[2].file == IMM);
         brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);

         ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);

         /* Select the high byte if the offset is odd, otherwise the low byte. */
         brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
         brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
         brw_reg result = ibld.vgrf(BRW_TYPE_UW);
         ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);

         /* An extra MOV is needed here to convert back to the corresponding
          * B type.
          */
         ibld.MOV(inst->dst, result);

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
953