• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_fs.h"
25 #include "elk_cfg.h"
26 #include "elk_eu.h"
27 
28 /** @file elk_fs_cmod_propagation.cpp
29  *
30  * Implements a pass that propagates the conditional modifier from a CMP x 0.0
31  * instruction into the instruction that generated x. For instance, in this
32  * sequence
33  *
34  *    add(8)          g70<1>F    g69<8,8,1>F    4096F
35  *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
36  *
37  * we can do the comparison as part of the ADD instruction directly:
38  *
39  *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
40  *
41  * If there had been a use of the flag register and another CMP using g70
42  *
43  *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
44  *    (+f0) sel(8)    g71<F>     g72<8,8,1>F    g73<8,8,1>F
45  *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
46  *
47  * we can recognize that the CMP is generating the flag value that already
48  * exists and therefore remove the instruction.
49  */
50 
51 using namespace elk;
52 
53 static bool
cmod_propagate_cmp_to_add(const intel_device_info * devinfo,elk_bblock_t * block,elk_fs_inst * inst)54 cmod_propagate_cmp_to_add(const intel_device_info *devinfo, elk_bblock_t *block,
55                           elk_fs_inst *inst)
56 {
57    bool read_flag = false;
58    const unsigned flags_written = inst->flags_written(devinfo);
59 
60    foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
61       if (scan_inst->opcode == ELK_OPCODE_ADD &&
62           !scan_inst->is_partial_write() &&
63           scan_inst->exec_size == inst->exec_size) {
64          bool negate;
65 
66          /* A CMP is basically a subtraction.  The result of the
67           * subtraction must be the same as the result of the addition.
68           * This means that one of the operands must be negated.  So (a +
69           * b) vs (a == -b) or (a + -b) vs (a == b).
70           */
71          if ((inst->src[0].equals(scan_inst->src[0]) &&
72               inst->src[1].negative_equals(scan_inst->src[1])) ||
73              (inst->src[0].equals(scan_inst->src[1]) &&
74               inst->src[1].negative_equals(scan_inst->src[0]))) {
75             negate = false;
76          } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
77                      inst->src[1].equals(scan_inst->src[1])) ||
78                     (inst->src[0].negative_equals(scan_inst->src[1]) &&
79                      inst->src[1].equals(scan_inst->src[0]))) {
80             negate = true;
81          } else {
82             goto not_match;
83          }
84 
85          /* If the scan instruction writes a different flag register than the
86           * instruction we're trying to propagate from, bail.
87           *
88           * FINISHME: The second part of the condition may be too strong.
89           * Perhaps (scan_inst->flags_written() & flags_written) !=
90           * flags_written?
91           */
92          if (scan_inst->flags_written(devinfo) != 0 &&
93              scan_inst->flags_written(devinfo) != flags_written)
94             goto not_match;
95 
96          /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
97           *
98           *    * Note that the [post condition signal] bits generated at
99           *      the output of a compute are before the .sat.
100           *
101           * Paragraph about post_zero does not mention saturation, but
102           * testing it on actual GPUs shows that conditional modifiers
103           * are applied after saturation.
104           *
105           *    * post_zero bit: This bit reflects whether the final
106           *      result is zero after all the clamping, normalizing,
107           *      or format conversion logic.
108           *
109           * For signed types we don't care about saturation: it won't
110           * change the result of conditional modifier.
111           *
112           * For floating and unsigned types there two special cases,
113           * when we can remove inst even if scan_inst is saturated: G
114           * and LE. Since conditional modifiers are just comparisons
115           * against zero, saturating positive values to the upper
116           * limit never changes the result of comparison.
117           *
118           * For negative values:
119           * (sat(x) >  0) == (x >  0) --- false
120           * (sat(x) <= 0) == (x <= 0) --- true
121           */
122          const enum elk_conditional_mod cond =
123             negate ? elk_swap_cmod(inst->conditional_mod)
124             : inst->conditional_mod;
125 
126          if (scan_inst->saturate &&
127              (elk_reg_type_is_floating_point(scan_inst->dst.type) ||
128               elk_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
129              (cond != ELK_CONDITIONAL_G &&
130               cond != ELK_CONDITIONAL_LE))
131             goto not_match;
132 
133          /* Otherwise, try propagating the conditional. */
134          if (scan_inst->can_do_cmod() &&
135              ((!read_flag && scan_inst->conditional_mod == ELK_CONDITIONAL_NONE) ||
136               scan_inst->conditional_mod == cond)) {
137             scan_inst->conditional_mod = cond;
138             scan_inst->flag_subreg = inst->flag_subreg;
139             inst->remove(block, true);
140             return true;
141          }
142          break;
143       }
144 
145    not_match:
146       if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
147          break;
148 
149       read_flag = read_flag ||
150                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
151    }
152 
153    return false;
154 }
155 
156 /**
157  * Propagate conditional modifiers from NOT instructions
158  *
159  * Attempt to convert sequences like
160  *
161  *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
162  *    ...
163  *    not.nz.f0(8)    null            g78<8,8,1>UD
164  *
165  * into
166  *
167  *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
168  */
169 static bool
cmod_propagate_not(const intel_device_info * devinfo,elk_bblock_t * block,elk_fs_inst * inst)170 cmod_propagate_not(const intel_device_info *devinfo, elk_bblock_t *block,
171                    elk_fs_inst *inst)
172 {
173    const enum elk_conditional_mod cond = elk_negate_cmod(inst->conditional_mod);
174    bool read_flag = false;
175    const unsigned flags_written = inst->flags_written(devinfo);
176 
177    if (cond != ELK_CONDITIONAL_Z && cond != ELK_CONDITIONAL_NZ)
178       return false;
179 
180    foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
181       if (regions_overlap(scan_inst->dst, scan_inst->size_written,
182                           inst->src[0], inst->size_read(0))) {
183          if (scan_inst->opcode != ELK_OPCODE_OR &&
184              scan_inst->opcode != ELK_OPCODE_AND)
185             break;
186 
187          if (scan_inst->is_partial_write() ||
188              scan_inst->dst.offset != inst->src[0].offset ||
189              scan_inst->exec_size != inst->exec_size)
190             break;
191 
192          /* If the scan instruction writes a different flag register than the
193           * instruction we're trying to propagate from, bail.
194           *
195           * FINISHME: The second part of the condition may be too strong.
196           * Perhaps (scan_inst->flags_written() & flags_written) !=
197           * flags_written?
198           */
199          if (scan_inst->flags_written(devinfo) != 0 &&
200              scan_inst->flags_written(devinfo) != flags_written)
201             break;
202 
203          if (scan_inst->can_do_cmod() &&
204              ((!read_flag && scan_inst->conditional_mod == ELK_CONDITIONAL_NONE) ||
205               scan_inst->conditional_mod == cond)) {
206             scan_inst->conditional_mod = cond;
207             scan_inst->flag_subreg = inst->flag_subreg;
208             inst->remove(block, true);
209             return true;
210          }
211          break;
212       }
213 
214       if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
215          break;
216 
217       read_flag = read_flag ||
218                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
219    }
220 
221    return false;
222 }
223 
/**
 * Run conditional-modifier propagation over a single basic block.
 *
 * The block is walked in reverse.  Candidates are unpredicated AND, CMP, MOV
 * and NOT instructions with a null destination whose first source lives in
 * the VGRF, ATTR or UNIFORM file.  CMPs against a non-zero value are handed
 * to cmod_propagate_cmp_to_add() and NOTs to cmod_propagate_not(); every
 * other candidate is handled by the backwards scan in this function, which
 * looks for the instruction that wrote the candidate's source.
 *
 * \return true if at least one instruction was removed from \p block.
 */
224 static bool
opt_cmod_propagation_local(const intel_device_info * devinfo,elk_bblock_t * block)225 opt_cmod_propagation_local(const intel_device_info *devinfo, elk_bblock_t *block)
226 {
227    bool progress = false;
228    UNUSED int ip = block->end_ip + 1;
229 
230    foreach_inst_in_block_reverse_safe(elk_fs_inst, inst, block) {
231       ip--;
232 
233       if ((inst->opcode != ELK_OPCODE_AND &&
234            inst->opcode != ELK_OPCODE_CMP &&
235            inst->opcode != ELK_OPCODE_MOV &&
236            inst->opcode != ELK_OPCODE_NOT) ||
237           inst->predicate != ELK_PREDICATE_NONE ||
238           !inst->dst.is_null() ||
239           (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
240            inst->src[0].file != UNIFORM))
241          continue;
242 
243       /* An ABS source modifier can only be handled when processing a compare
244        * with a value other than zero.
245        */
246       if (inst->src[0].abs &&
247           (inst->opcode != ELK_OPCODE_CMP || inst->src[1].is_zero()))
248          continue;
249 
250       /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
251        * generated (for ir_unop_not in elk_fs_visitor::emit_bool_to_cond_code).
252        * Propagating those would require inverting the condition on the CMP.
253        * This changes both the flag value and the register destination of the
254        * CMP.  That result may be used elsewhere, so we can't change its value
255        * on a whim.
256        */
257       if (inst->opcode == ELK_OPCODE_AND &&
258           !(inst->src[1].is_one() &&
259             inst->conditional_mod == ELK_CONDITIONAL_NZ &&
260             !inst->src[0].negate))
261          continue;
262 
263       /* A CMP with a second source of zero can match with anything.  A CMP
264        * with a second source that is not zero can only match with an ADD
265        * instruction.
266        *
267        * Only apply this optimization to floating-point sources.  It can fail
268        * for integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4,
269        * but int(0x80000000) - 4 overflows and results in 0x7ffffffc.  That's
270        * not less than zero, so the flags get set differently than for (a < b).
271        */
272       if (inst->opcode == ELK_OPCODE_CMP && !inst->src[1].is_zero()) {
273          if (elk_reg_type_is_floating_point(inst->src[0].type) &&
274              cmod_propagate_cmp_to_add(devinfo, block, inst))
275             progress = true;
276 
277          continue;
278       }
279 
280       if (inst->opcode == ELK_OPCODE_NOT) {
281          progress = cmod_propagate_not(devinfo, block, inst) || progress;
282          continue;
283       }
284 
      /* read_flag records whether any instruction between scan_inst and inst
       * reads the flag bits that inst writes.
       */
285       bool read_flag = false;
286       const unsigned flags_written = inst->flags_written(devinfo);
287       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
         /* The first instruction whose destination overlaps inst's source
          * ends the scan: every path through this branch breaks out.
          */
288          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
289                              inst->src[0], inst->size_read(0))) {
290             /* If the scan instruction writes a different flag register than
291              * the instruction we're trying to propagate from, bail.
292              *
293              * FINISHME: The second part of the condition may be too strong.
294              * Perhaps (scan_inst->flags_written() & flags_written) !=
295              * flags_written?
296              */
297             if (scan_inst->flags_written(devinfo) != 0 &&
298                 scan_inst->flags_written(devinfo) != flags_written)
299                break;
300 
301             if (scan_inst->is_partial_write() ||
302                 scan_inst->dst.offset != inst->src[0].offset ||
303                 scan_inst->exec_size != inst->exec_size)
304                break;
305 
306             /* If the write mask is different we can't propagate. */
307             if (scan_inst->force_writemask_all != inst->force_writemask_all)
308                break;
309 
310             /* CMP's result is the same regardless of dest type. */
311             if (inst->conditional_mod == ELK_CONDITIONAL_NZ &&
312                 scan_inst->opcode == ELK_OPCODE_CMP &&
313                 elk_reg_type_is_integer(inst->dst.type)) {
314                inst->remove(block, true);
315                progress = true;
316                break;
317             }
318 
319             /* If the AND wasn't handled by the previous case, it isn't safe
320              * to remove it.
321              */
322             if (inst->opcode == ELK_OPCODE_AND)
323                break;
324 
325             if (inst->opcode == ELK_OPCODE_MOV) {
326                if (elk_reg_type_is_floating_point(scan_inst->dst.type)) {
327                   /* If the destination type of scan_inst is floating-point,
328                    * then:
329                    *
330                    * - The source of the MOV instruction must be the same
331                    *   type.
332                    *
333                    * - The destination of the MOV instruction must be floating
334                    *   point with a size at least as large as the destination
335                    *   of scan_inst.  Size-reducing f2f conversions could
336                    *   cause non-zero values to become zero, etc.
337                    */
338                   if (scan_inst->dst.type != inst->src[0].type)
339                      break;
340 
341                   if (!elk_reg_type_is_floating_point(inst->dst.type))
342                      break;
343 
344                   if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
345                      break;
346                } else {
347                   /* If the destination type of scan_inst is integer, then:
348                    *
349                    * - The source of the MOV instruction must be integer with
350                    *   the same size.
351                    *
352                    * - If the conditional modifier is Z or NZ, then the
353                    *   destination type of inst must either be floating point
354                    *   (of any size) or integer with a size at least as large
355                    *   as the destination of scan_inst.
356                    *
357                    * - If the conditional modifier is neither Z nor NZ, then the
358                    *   destination type of inst must either be floating point
359                    *   (of any size) or integer with a size at least as large
360                    *   as the destination of scan_inst and the same signedness.
361                    */
362                   if (!elk_reg_type_is_integer(inst->src[0].type) ||
363                       type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
364                      break;
365 
366                   if (elk_reg_type_is_integer(inst->dst.type)) {
367                      if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
368                         break;
369 
370                      if (inst->conditional_mod != ELK_CONDITIONAL_Z &&
371                          inst->conditional_mod != ELK_CONDITIONAL_NZ &&
372                          elk_reg_type_is_unsigned_integer(inst->dst.type) !=
373                          elk_reg_type_is_unsigned_integer(scan_inst->dst.type))
374                         break;
375                   }
376                }
377             } else {
378                /* Not safe to use inequality operators if the types are
379                 * different.
380                 */
381                if (scan_inst->dst.type != inst->src[0].type &&
382                    inst->conditional_mod != ELK_CONDITIONAL_Z &&
383                    inst->conditional_mod != ELK_CONDITIONAL_NZ)
384                   break;
385 
386                /* Comparisons operate differently for ints and floats */
387                if (scan_inst->dst.type != inst->dst.type) {
388                   /* Comparison result may be altered if the bit-size changes
389                    * since that affects range, denorms, etc
390                    */
391                   if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
392                      break;
393 
394                   if (elk_reg_type_is_floating_point(scan_inst->dst.type) !=
395                       elk_reg_type_is_floating_point(inst->dst.type))
396                      break;
397                }
398             }
399 
400             /* Knowing the following:
401              * - CMP writes to flag register the result of
402              *   applying cmod to the `src0 - src1`.
403              *   After that it stores the same value to dst.
404              *   Other instructions first store their result to
405              *   dst, and then store cmod(dst) to the flag
406              *   register.
407              * - inst is either CMP or MOV
408              * - inst->dst is null
409              * - inst->src[0] overlaps with scan_inst->dst
410              * - inst->src[1] is zero
411              * - scan_inst wrote to a flag register
412              *
413              * There can be three possible paths:
414              *
415              * - scan_inst is CMP:
416              *
417              *   Considering that src0 is either 0x0 (false),
418              *   or 0xffffffff (true), and src1 is 0x0:
419              *
420              *   - If inst's cmod is NZ, we can always remove
421              *     inst: NZ is invariant for false and true. This
422              *     holds even if src0 is NaN: .nz is the only cmod,
423              *     that returns true for NaN.
424              *
425              *   - .g is invariant if src0 has a UD type
426              *
427              *   - .l is invariant if src0 has a D type
428              *
429              * - scan_inst and inst have the same cmod:
430              *
431              *   If scan_inst is anything other than CMP, it already
432              *   wrote the appropriate value to the flag register.
433              *
434              * - else:
435              *
436              *   We can change cmod of scan_inst to that of inst,
437              *   and remove inst. It is valid as long as we make
438              *   sure that no instruction uses the flag register
439              *   between scan_inst and inst.
440              */
441             if (!inst->src[0].negate &&
442                 scan_inst->flags_written(devinfo)) {
443                if (scan_inst->opcode == ELK_OPCODE_CMP) {
444                   if ((inst->conditional_mod == ELK_CONDITIONAL_NZ) ||
445                       (inst->conditional_mod == ELK_CONDITIONAL_G &&
446                        inst->src[0].type == ELK_REGISTER_TYPE_UD) ||
447                       (inst->conditional_mod == ELK_CONDITIONAL_L &&
448                        inst->src[0].type == ELK_REGISTER_TYPE_D)) {
449                      inst->remove(block, true);
450                      progress = true;
451                      break;
452                   }
453                } else if (scan_inst->conditional_mod == inst->conditional_mod) {
454                   /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
455                    * flags value is not based on the result stored in the
456                    * destination.  On all other platforms sel.cond will not
457                    * write the flags, so execution will not get to this point.
458                    */
459                   if (scan_inst->opcode == ELK_OPCODE_SEL) {
460                      assert(devinfo->ver <= 5);
461                   } else {
462                      inst->remove(block, true);
463                      progress = true;
464                   }
465 
466                   break;
467                } else if (!read_flag && scan_inst->can_do_cmod()) {
468                   scan_inst->conditional_mod = inst->conditional_mod;
469                   scan_inst->flag_subreg = inst->flag_subreg;
470                   inst->remove(block, true);
471                   progress = true;
472                   break;
473                }
474             }
475 
476             /* The conditional mod of the CMP/CMPN instructions behaves
477              * specially because the flag output is not calculated from the
478              * result of the instruction, but the other way around, which
479              * means that even if the condmod to propagate and the condmod
480              * from the CMP instruction are the same they will in general give
481              * different results because they are evaluated based on different
482              * inputs.
483              */
484             if (scan_inst->opcode == ELK_OPCODE_CMP ||
485                 scan_inst->opcode == ELK_OPCODE_CMPN)
486                break;
487 
488             /* From the Sky Lake PRM, Vol 2a, "Multiply":
489              *
490              *    "When multiplying integer data types, if one of the sources
491              *     is a DW, the resulting full precision data is stored in
492              *     the accumulator. However, if the destination data type is
493              *     either W or DW, the low bits of the result are written to
494              *     the destination register and the remaining high bits are
495              *     discarded. This results in undefined Overflow and Sign
496              *     flags. Therefore, conditional modifiers and saturation
497              *     (.sat) cannot be used in this case."
498              *
499              * We just disallow cmod propagation on all integer multiplies.
500              */
501             if (!elk_reg_type_is_floating_point(scan_inst->dst.type) &&
502                 scan_inst->opcode == ELK_OPCODE_MUL)
503                break;
504 
            /* A negated source flips the sense of the comparison, so the
             * modifier handed to scan_inst is the swapped one.
             */
505             enum elk_conditional_mod cond =
506                inst->src[0].negate ? elk_swap_cmod(inst->conditional_mod)
507                                    : inst->conditional_mod;
508 
509             /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
510              *
511              *    * Note that the [post condition signal] bits generated at
512              *      the output of a compute are before the .sat.
513              *
514              * Paragraph about post_zero does not mention saturation, but
515              * testing it on actual GPUs shows that conditional modifiers are
516              * applied after saturation.
517              *
518              *    * post_zero bit: This bit reflects whether the final
519              *      result is zero after all the clamping, normalizing,
520              *      or format conversion logic.
521              *
522              * For this reason, no additional restrictions are necessary on
523              * instructions with saturate.
524              */
525 
526             /* Otherwise, try propagating the conditional. */
527             if (scan_inst->can_do_cmod() &&
528                 ((!read_flag && scan_inst->conditional_mod == ELK_CONDITIONAL_NONE) ||
529                  scan_inst->conditional_mod == cond)) {
530                scan_inst->conditional_mod = cond;
531                scan_inst->flag_subreg = inst->flag_subreg;
532                inst->remove(block, true);
533                progress = true;
534             }
535             break;
536          }
537 
         /* scan_inst does not produce inst's source: stop if it overwrites
          * the flag bits inst writes, otherwise note whether it reads them.
          */
538          if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
539             break;
540 
541          read_flag = read_flag ||
542                      (scan_inst->flags_read(devinfo) & flags_written) != 0;
543       }
544    }
545 
546    /* There is progress if and only if instructions were removed. */
547    assert(progress == (block->end_ip_delta != 0));
548 
549    return progress;
550 }
551 
552 bool
opt_cmod_propagation()553 elk_fs_visitor::opt_cmod_propagation()
554 {
555    bool progress = false;
556 
557    foreach_block_reverse(block, cfg) {
558       progress = opt_cmod_propagation_local(devinfo, block) || progress;
559    }
560 
561    if (progress) {
562       cfg->adjust_block_ips();
563 
564       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
565    }
566 
567    return progress;
568 }
569