/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
bool
brw_fs_lower_constant_loads(fs_visitor &s)
{
   unsigned index, pull_index;
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      const fs_builder ibld(&s, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         const unsigned base = pull_index * 4;

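         /* Illustrative example (not in the original source): pull_index 19
          * gives base == 76, so the load below fetches the 64-byte block at
          * offset 64 and the rewritten source reads at offset 12 within it.
          */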
         fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
         srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
         srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]  = brw_imm_ud(base & ~(block_sz - 1));
         srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]    = brw_imm_ud(block_sz);

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
                   srcs, PULL_UNIFORM_CONSTANT_SRCS);

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;

         progress = true;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {

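         /* Illustrative note (not in the original source): the indirect
          * offset in src[1] is only known at run time, so this path uses
          * VARYING_PULL_CONSTANT_LOAD instead of the uniform block load
          * above.
          */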
         if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

         s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
                                      brw_imm_ud(index),
                                      fs_reg() /* surface_handle */,
                                      inst->src[1],
                                      pull_index * 4, 4, 1);
         inst->remove(block);

         progress = true;
      }
   }
   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

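      /* Illustrative note (not in the original source): when two header
       * sources are consecutive GRFs (stride 1 and src[i + 1] exactly one
       * REG_SIZE past src[i]), the loop below copies both with a single
       * SIMD16 MOV (n == 2) instead of two SIMD8 MOVs.
       */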
      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
                                     retype(inst->src[i], BRW_REGISTER_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_minmax(fs_visitor &s)
{
   assert(s.devinfo->ver < 6);

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == BRW_OPCODE_SEL &&
          inst->predicate == BRW_PREDICATE_NONE) {
         /* If src1 is an immediate value that is known not to be NaN, emit
          * CMP, because it is much better for cmod propagation.  Likewise if
          * src1 is not float.  Gfx4 and Gfx5 don't support HF or DF, so it
          * is not necessary to check for those.
          */
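         /* Illustrative example (not in the original source): a min such as
          *    sel.l dst, x, y
          * becomes
          *    cmpn.l null, x, y    (or cmp.l when y can't be NaN)
          *    (+f0) sel dst, x, y
          */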
         if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
             (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     inst->conditional_mod);
         } else {
            ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
                      inst->conditional_mod);
         }
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000 (the
          * minimum D-type value), its negation is again 0x80000000.  As a
          * result, subtractSaturate(0, 0x80000000) will produce 0x80000000
          * instead of 0x7fffffff.  There are at least three ways to resolve
          * this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x180000000, whose negation is 0x080000000.  This doesn't help
          *    for 64-bit integers (which are already bigger than 33 bits).
          *    There are also only 8 accumulators, so SIMD16 or SIMD32
          *    instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, (x >> 1)
          *    can always be negated without overflow, so we can do
          *    subtractions involving only (x >> 1):  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
          */
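         /* Worked example for approach #2 (illustrative, not in the original
          * source): with b == 0x80000000 (INT_MIN), b >> 1 == 0xc0000000 and
          * b - (b >> 1) == 0xc0000000; both operands are safely negatable, so
          * the saturating adds below never hit the INT_MIN negation case.
          */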
         if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
             inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
            fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
            fs_inst *add;

            ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));

            add = ibld.ADD(tmp2, inst->src[1], tmp1);
            add->src[1].negate = true;

            add = ibld.ADD(tmp3, inst->src[0], tmp1);
            add->src[1].negate = true;
            add->saturate = true;

            add = ibld.ADD(inst->dst, tmp3, tmp2);
            add->src[1].negate = true;
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- this is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
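/* For reference (illustrative, not from the original source): the standard
 * component layout that this pass starts from in SIMD16 is
 *
 *    rN+0: X[0-7]
 *    rN+1: X[8-15]
 *    rN+2: Y[0-7]
 *    rN+3: Y[8-15]
 */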
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP: {
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
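   /* Illustrative example (not in the original source): for one quad with
    * channel values (v0, v1, v2, v3), DDX_COARSE uses swizzles (XXXX, YYYY),
    * so the ADD emitted below computes v1 - v0 in all four channels.
    */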
   const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
   const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
brw_fs_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.stage_prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
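      /* Illustrative example (not in the original source): with a dispatch
       * mask of 0x00ff and an execution mask of 0x00f0, the combined mask is
       * 0x00f0, so FBL below yields 4 and 31 - LZD yields 7.
       */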
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
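         /* Illustrative note (not in the original source): with
          * ex_mlen == 3, the loop below emits one SIMD16 MOV covering the
          * first two GRFs and one SIMD8 MOV for the remaining GRF.
          */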
         const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
bool
brw_fs_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
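      /* Illustrative note (not in the original source): in SIMD16 the
       * replacement destination below is a two-GRF scratch VGRF, since
       * dispatch_width / 8 == 2.
       */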
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8),
                            inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}