• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_target.h"
25 #include "codegen/nv50_ir_build_util.h"
26 
27 extern "C" {
28 #include "util/u_math.h"
29 }
30 
31 namespace nv50_ir {
32 
33 bool
isNop() const34 Instruction::isNop() const
35 {
36    if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
37       return true;
38    if (terminator || join) // XXX: should terminator imply flow ?
39       return false;
40    if (op == OP_ATOM)
41       return false;
42    if (!fixed && op == OP_NOP)
43       return true;
44 
45    if (defExists(0) && def(0).rep()->reg.data.id < 0) {
46       for (int d = 1; defExists(d); ++d)
47          if (def(d).rep()->reg.data.id >= 0)
48             WARN("part of vector result is unused !\n");
49       return true;
50    }
51 
52    if (op == OP_MOV || op == OP_UNION) {
53       if (!getDef(0)->equals(getSrc(0)))
54          return false;
55       if (op == OP_UNION)
56          if (!def(0).rep()->equals(getSrc(1)))
57             return false;
58       return true;
59    }
60 
61    return false;
62 }
63 
isDead() const64 bool Instruction::isDead() const
65 {
66    if (op == OP_STORE ||
67        op == OP_EXPORT ||
68        op == OP_ATOM ||
69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
70        op == OP_WRSV)
71       return false;
72 
73    for (int d = 0; defExists(d); ++d)
74       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
75          return false;
76 
77    if (terminator || asFlow())
78       return false;
79    if (fixed)
80       return false;
81 
82    return true;
83 };
84 
85 // =============================================================================
86 
// Pass that forward-propagates plain MOVs so that later passes see the
// original values directly (details in the comment on visit()).
class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
92 
93 // Propagate all MOVs forward to make subsequent optimization easier, except if
94 // the sources stem from a phi, in which case we don't want to mess up potential
95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
96 bool
visit(BasicBlock * bb)97 CopyPropagation::visit(BasicBlock *bb)
98 {
99    Instruction *mov, *si, *next;
100 
101    for (mov = bb->getEntry(); mov; mov = next) {
102       next = mov->next;
103       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
104          continue;
105       if (mov->getPredicate())
106          continue;
107       if (mov->def(0).getFile() != mov->src(0).getFile())
108          continue;
109       si = mov->getSrc(0)->getInsn();
110       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
111          // propagate
112          mov->def(0).replace(mov->getSrc(0), false);
113          delete_Instruction(prog, mov);
114       }
115    }
116    return true;
117 }
118 
119 // =============================================================================
120 
// Pass that collapses matching SPLIT / MERGE pairs (see comment on visit()).
class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
126 
127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
128 // post-merge def with the SPLIT's source.
129 bool
visit(BasicBlock * bb)130 MergeSplits::visit(BasicBlock *bb)
131 {
132    Instruction *i, *next, *si;
133 
134    for (i = bb->getEntry(); i; i = next) {
135       next = i->next;
136       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
137          continue;
138       si = i->getSrc(0)->getInsn();
139       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
140          continue;
141       i->def(0).replace(si->getSrc(0), false);
142       delete_Instruction(prog, i);
143    }
144 
145    return true;
146 }
147 
148 // =============================================================================
149 
// Pass that folds loads / immediates directly into consuming instructions,
// swapping commutative sources where that enables the fold.
class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Swap sources 0/1 when that lets source 1 be folded (see definition).
   void checkSwapSrc01(Instruction *);

   // Predicates classifying the instruction defining a source value.
   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};
161 
162 bool
isCSpaceLoad(Instruction * ld)163 LoadPropagation::isCSpaceLoad(Instruction *ld)
164 {
165    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
166 }
167 
168 bool
isImmdLoad(Instruction * ld)169 LoadPropagation::isImmdLoad(Instruction *ld)
170 {
171    if (!ld || (ld->op != OP_MOV) ||
172        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
173       return false;
174 
175    // A 0 can be replaced with a register, so it doesn't count as an immediate.
176    ImmediateValue val;
177    return ld->src(0).getImmediate(val) && !val.isInteger(0);
178 }
179 
180 bool
isAttribOrSharedLoad(Instruction * ld)181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
182 {
183    return ld &&
184       (ld->op == OP_VFETCH ||
185        (ld->op == OP_LOAD &&
186         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
187          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
188 }
189 
// Swap sources 0 and 1 of @insn when that moves a foldable operand (const
// load, immediate, attribute/shared load) into source 1, which is the slot
// a fold targets; afterwards, fix up condition codes / modifiers for the
// non-commutative ops (SET/SLCT/SUB) that were swapped.
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   // Only commutative ops may swap freely; SET/SLCT/SUB are handled by the
   // compensation code at the bottom.
   if (!targ->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   // Prefer moving a const-space load / immediate into source 1, unless
   // source 1 already holds one that is loadable and at least as rarely used.
   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   // Otherwise move an attribute/shared load into source 1.
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   // Compensate for the swap: comparisons reverse their condition
   // (a<b -> b>a), SLCT inverts it, SUB negates both operands instead.
   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   }
}
241 
// For each instruction in @bb, replace register sources with the load /
// immediate-mov that defines them, wherever the target can encode that
// operand directly; dead loads are removed afterwards.
bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      // Place the more profitable operand into source 1 first.
      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         // Only plain loads / movs (not fixed) that the target can encode
         // in this particular source slot are propagated.
         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         // Delete the load if this was its last use.
         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}
279 
280 // =============================================================================
281 
// Pass that folds constant parts of an indirect address (ADD/SUB/MOV of an
// immediate) into the instruction's immediate offset field.
class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
287 
// For each indirect source, look at the instruction computing the address:
// if it is base +/- immediate (or a pure immediate MOV), move the constant
// into the source's offset field when the target can encode it.
bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            // base + immediate: fold the immediate into the offset if the
            // target can encode the resulting offset.
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            // base - immediate: same, subtracting the offset instead.
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            // Fully constant address: drop the indirect operand entirely.
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}
333 
334 // =============================================================================
335 
// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   // Run folding repeatedly until a fixed point (bounded; see definition).
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   // Fold an instruction whose sources are all immediates (2- / 3-source).
   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   // Simplify an instruction with a single immediate source at index s.
   void opnd(Instruction *, ImmediateValue&, int s);
   // Simplify based on an immediate third source (MAD/FMA/SHLADD).
   void opnd3(Instruction *, ImmediateValue&);

   // Fold single-source f32 ops with an immediate source into a MOV.
   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   CmpInstruction *findOriginForTestWithZero(Value *);

   // Number of folds performed during the current run (drives reiteration).
   unsigned int foldCount;

   BuildUtil bld;
};
360 
361 // TODO: remember generated immediates and only revisit these
362 bool
foldAll(Program * prog)363 ConstantFolding::foldAll(Program *prog)
364 {
365    unsigned int iterCount = 0;
366    do {
367       foldCount = 0;
368       if (!run(prog))
369          return false;
370    } while (foldCount && ++iterCount < 2);
371    return true;
372 }
373 
// Dispatch each instruction to the appropriate folding helper depending on
// how many of its sources are immediates.
bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      // Plain MOVs are already in folded form; CALL args must stay in regs.
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      // Prefer full evaluation (all sources immediate) over partial
      // simplification with a single immediate source; the else-if order
      // is significant.
      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2))
         expr(i, src0, src1, src2);
      else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
      // Independently of the above, a constant third source may simplify
      // MAD/FMA/SHLADD (note: i may have been rewritten by the calls above).
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}
406 
407 CmpInstruction *
findOriginForTestWithZero(Value * value)408 ConstantFolding::findOriginForTestWithZero(Value *value)
409 {
410    if (!value)
411       return NULL;
412    Instruction *insn = value->getInsn();
413 
414    if (insn->asCmp() && insn->op != OP_SLCT)
415       return insn->asCmp();
416 
417    /* Sometimes mov's will sneak in as a result of other folding. This gets
418     * cleaned up later.
419     */
420    if (insn->op == OP_MOV)
421       return findOriginForTestWithZero(insn->getSrc(0));
422 
423    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
424    if (insn->op == OP_AND) {
425       int s = 0;
426       ImmediateValue imm;
427       if (!insn->src(s).getImmediate(imm)) {
428          s = 1;
429          if (!insn->src(s).getImmediate(imm))
430             return NULL;
431       }
432       if (imm.reg.data.f32 != 1.0f)
433          return NULL;
434       /* TODO: Come up with a way to handle the condition being inverted */
435       if (insn->src(!s).mod != Modifier(0))
436          return NULL;
437       return findOriginForTestWithZero(insn->getSrc(!s));
438    }
439 
440    return NULL;
441 }
442 
// Apply this modifier (ABS/NEG/SAT/NOT, per reg.type) directly to an
// immediate value, mutating it in place.
void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         // saturate = clamp to [0, 1]
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      // bitwise NOT is meaningless on floats
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         // saturate = clamp to [0, 1]
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}
500 
501 operation
getOp() const502 Modifier::getOp() const
503 {
504    switch (bits) {
505    case NV50_IR_MOD_ABS: return OP_ABS;
506    case NV50_IR_MOD_NEG: return OP_NEG;
507    case NV50_IR_MOD_SAT: return OP_SAT;
508    case NV50_IR_MOD_NOT: return OP_NOT;
509    case 0:
510       return OP_MOV;
511    default:
512       return OP_CVT;
513    }
514 }
515 
// Fold a two-source instruction whose sources are both immediates: compute
// the result per opcode and dType, then rewrite the instruction into a MOV
// (or SAT / ADD for MAD-FMA) of the folded constant.
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      // dnz: flush non-finite f32 inputs to zero before multiplying.
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         // include the post-multiply factor in the folded value
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0) // leave division by zero alone
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      // arithmetic vs logical shift depends on signedness
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      // can only fold a select when both alternatives are equal
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
      // b encodes the bitfield: offset in bits 0-7, width in bits 8-15
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         // shift left to drop high bits, then right to (sign-)extend
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         // src0 forms the low 32 bits, src1 the high 32 bits
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   // Rewrite the instruction: the folded constant becomes source 0.
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      // The remaining ADD may itself be foldable now.
      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
      break;
   }
   i->subOp = 0;
}
733 
// Fold a three-source instruction (INSBF / MAD / FMA / SHLADD) whose
// sources are all immediates into a MOV of the computed constant.
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0,
                      ImmediateValue &imm1,
                      ImmediateValue &imm2)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_INSBF: {
      // b encodes the bitfield: offset in bits 0-7, width in bits 8-15.
      // NOTE(review): `1 << width` is undefined for width >= 32 — presumably
      // the sub-op encoding keeps width < 32 here; confirm against emitters.
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      unsigned bitmask = ((1 << width) - 1) << offset;
      res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
      break;
   }
   case OP_MAD:
   case OP_FMA: {
      switch (i->dType) {
      case TYPE_F32:
         // include the post-multiply factor in the folded product
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
            c->data.f32;
         break;
      case TYPE_F64:
         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
         break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
         break;
      default:
         return;
      }
      break;
   }
   case OP_SHLADD:
      res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
      break;
   default:
      return;
   }

   ++foldCount;
   // Rewrite into a MOV of the folded constant.
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->src(2).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);
   i->setSrc(2, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = i->dType;
   i->getSrc(0)->reg.size = typeSizeof(i->dType);

   i->op = OP_MOV;
}
803 
804 void
unary(Instruction * i,const ImmediateValue & imm)805 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
806 {
807    Storage res;
808 
809    if (i->dType != TYPE_F32)
810       return;
811    switch (i->op) {
812    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
813    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
814    case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
815    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
816    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
817    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
818    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
819    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
820    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
821    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
822    case OP_PRESIN:
823    case OP_PREEX2:
824       // these should be handled in subsequent OP_SIN/COS/EX2
825       res.data.f32 = imm.reg.data.f32;
826       break;
827    default:
828       return;
829    }
830    i->op = OP_MOV;
831    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
832    i->src(0).mod = Modifier(0);
833 }
834 
// Collapse chains of f32 multiplies where one factor of @mul2 (source s) is
// the immediate imm2: either fold the immediate into an earlier MUL, or
// turn the constant factor into a hardware post-multiply factor.
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1; // index of the non-immediate source
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   // total constant factor, including mul2's existing post factor
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   // Case 1: mul2's other source is the single use of an earlier f32 MUL.
   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm   -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            // negative factor: fold the sign into a source modifier
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   // Case 2: mul2's result feeds exactly one later f32 MUL.
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c   -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}
899 
900 void
opnd3(Instruction * i,ImmediateValue & imm2)901 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
902 {
903    switch (i->op) {
904    case OP_MAD:
905    case OP_FMA:
906       if (imm2.isInteger(0)) {
907          i->op = OP_MUL;
908          i->setSrc(2, NULL);
909          foldCount++;
910          return;
911       }
912       break;
913    case OP_SHLADD:
914       if (imm2.isInteger(0)) {
915          i->op = OP_SHL;
916          i->setSrc(2, NULL);
917          foldCount++;
918          return;
919       }
920       break;
921    default:
922       return;
923    }
924 }
925 
926 void
opnd(Instruction * i,ImmediateValue & imm0,int s)927 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
928 {
929    const Target *target = prog->getTarget();
930    const int t = !s;
931    const operation op = i->op;
932    Instruction *newi = i;
933 
934    switch (i->op) {
935    case OP_SPLIT: {
936       bld.setPosition(i, false);
937 
938       uint8_t size = i->getDef(0)->reg.size;
939       uint32_t mask = (1ULL << size) - 1;
940       assert(size <= 32);
941 
942       uint64_t val = imm0.reg.data.u64;
943       for (int8_t d = 0; i->defExists(d); ++d) {
944          Value *def = i->getDef(d);
945          assert(def->reg.size == size);
946 
947          newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
948          val >>= size;
949       }
950       delete_Instruction(prog, i);
951       break;
952    }
953    case OP_MUL:
954       if (i->dType == TYPE_F32)
955          tryCollapseChainedMULs(i, s, imm0);
956 
957       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
958          assert(!isFloatType(i->sType));
959          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
960             bld.setPosition(i, false);
961             // Need to set to the sign value, which is a compare.
962             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
963                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
964             delete_Instruction(prog, i);
965          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
966             // The high bits can't be set in this case (either mul by 0 or
967             // unsigned by 1)
968             i->op = OP_MOV;
969             i->subOp = 0;
970             i->setSrc(0, new_ImmediateValue(prog, 0u));
971             i->src(0).mod = Modifier(0);
972             i->setSrc(1, NULL);
973          } else if (!imm0.isNegative() && imm0.isPow2()) {
974             // Translate into a shift
975             imm0.applyLog2();
976             i->op = OP_SHR;
977             i->subOp = 0;
978             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
979             i->setSrc(0, i->getSrc(t));
980             i->src(0).mod = i->src(t).mod;
981             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
982             i->src(1).mod = 0;
983          }
984       } else
985       if (imm0.isInteger(0)) {
986          i->op = OP_MOV;
987          i->setSrc(0, new_ImmediateValue(prog, 0u));
988          i->src(0).mod = Modifier(0);
989          i->postFactor = 0;
990          i->setSrc(1, NULL);
991       } else
992       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
993          if (imm0.isNegative())
994             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
995          i->op = i->src(t).mod.getOp();
996          if (s == 0) {
997             i->setSrc(0, i->getSrc(1));
998             i->src(0).mod = i->src(1).mod;
999             i->src(1).mod = 0;
1000          }
1001          if (i->op != OP_CVT)
1002             i->src(0).mod = 0;
1003          i->setSrc(1, NULL);
1004       } else
1005       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
1006          if (imm0.isNegative())
1007             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1008          i->op = OP_ADD;
1009          i->setSrc(s, i->getSrc(t));
1010          i->src(s).mod = i->src(t).mod;
1011       } else
1012       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
1013          i->op = OP_SHL;
1014          imm0.applyLog2();
1015          i->setSrc(0, i->getSrc(t));
1016          i->src(0).mod = i->src(t).mod;
1017          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1018          i->src(1).mod = 0;
1019       } else
1020       if (i->postFactor && i->sType == TYPE_F32) {
1021          /* Can't emit a postfactor with an immediate, have to fold it in */
1022          i->setSrc(s, new_ImmediateValue(
1023                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
1024          i->postFactor = 0;
1025       }
1026       break;
1027    case OP_MAD:
1028       if (imm0.isInteger(0)) {
1029          i->setSrc(0, i->getSrc(2));
1030          i->src(0).mod = i->src(2).mod;
1031          i->setSrc(1, NULL);
1032          i->setSrc(2, NULL);
1033          i->op = i->src(0).mod.getOp();
1034          if (i->op != OP_CVT)
1035             i->src(0).mod = 0;
1036       } else
1037       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
1038           (imm0.isInteger(1) || imm0.isInteger(-1))) {
1039          if (imm0.isNegative())
1040             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1041          if (s == 0) {
1042             i->setSrc(0, i->getSrc(1));
1043             i->src(0).mod = i->src(1).mod;
1044          }
1045          i->setSrc(1, i->getSrc(2));
1046          i->src(1).mod = i->src(2).mod;
1047          i->setSrc(2, NULL);
1048          i->op = OP_ADD;
1049       } else
1050       if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
1051           target->isOpSupported(OP_SHLADD, i->dType)) {
1052          i->op = OP_SHLADD;
1053          imm0.applyLog2();
1054          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1055       }
1056       break;
1057    case OP_ADD:
1058    case OP_SUB:
1059       if (i->usesFlags())
1060          break;
1061       if (imm0.isInteger(0)) {
1062          if (s == 0) {
1063             i->setSrc(0, i->getSrc(1));
1064             i->src(0).mod = i->src(1).mod;
1065             if (i->op == OP_SUB)
1066                i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1067          }
1068          i->setSrc(1, NULL);
1069          i->op = i->src(0).mod.getOp();
1070          if (i->op != OP_CVT)
1071             i->src(0).mod = Modifier(0);
1072       }
1073       break;
1074 
1075    case OP_DIV:
1076       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
1077          break;
1078       bld.setPosition(i, false);
1079       if (imm0.reg.data.u32 == 0) {
1080          break;
1081       } else
1082       if (imm0.reg.data.u32 == 1) {
1083          i->op = OP_MOV;
1084          i->setSrc(1, NULL);
1085       } else
1086       if (i->dType == TYPE_U32 && imm0.isPow2()) {
1087          i->op = OP_SHR;
1088          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
1089       } else
1090       if (i->dType == TYPE_U32) {
1091          Instruction *mul;
1092          Value *tA, *tB;
1093          const uint32_t d = imm0.reg.data.u32;
1094          uint32_t m;
1095          int r, s;
1096          uint32_t l = util_logbase2(d);
1097          if (((uint32_t)1 << l) < d)
1098             ++l;
1099          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
1100          r = l ? 1 : 0;
1101          s = l ? (l - 1) : 0;
1102 
1103          tA = bld.getSSA();
1104          tB = bld.getSSA();
1105          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
1106                          bld.loadImm(NULL, m));
1107          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
1108          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
1109          tA = bld.getSSA();
1110          if (r)
1111             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
1112          else
1113             tA = tB;
1114          tB = s ? bld.getSSA() : i->getDef(0);
1115          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
1116          if (s)
1117             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
1118 
1119          delete_Instruction(prog, i);
1120       } else
1121       if (imm0.reg.data.s32 == -1) {
1122          i->op = OP_NEG;
1123          i->setSrc(1, NULL);
1124       } else {
1125          LValue *tA, *tB;
1126          LValue *tD;
1127          const int32_t d = imm0.reg.data.s32;
1128          int32_t m;
1129          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
1130          if ((1 << l) < abs(d))
1131             ++l;
1132          if (!l)
1133             l = 1;
1134          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
1135 
1136          tA = bld.getSSA();
1137          tB = bld.getSSA();
1138          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
1139                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
1140          if (l > 1)
1141             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
1142          else
1143             tB = tA;
1144          tA = bld.getSSA();
1145          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
1146          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
1147          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
1148          if (d < 0)
1149             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
1150 
1151          delete_Instruction(prog, i);
1152       }
1153       break;
1154 
1155    case OP_MOD:
1156       if (i->sType == TYPE_U32 && imm0.isPow2()) {
1157          bld.setPosition(i, false);
1158          i->op = OP_AND;
1159          i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1160       }
1161       break;
1162 
1163    case OP_SET: // TODO: SET_AND,OR,XOR
1164    {
1165       /* This optimizes the case where the output of a set is being compared
1166        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1167        * can be a lot cleverer in our comparison.
1168        */
1169       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
1170       CondCode cc, ccZ;
1171       if (imm0.reg.data.u32 != 0 || !si)
1172          return;
1173       cc = si->setCond;
1174       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
1175       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
1176       // first.
1177       if (s == 0)
1178          ccZ = reverseCondCode(ccZ);
1179       // If there is a negative modifier, we need to undo that, by flipping
1180       // the comparison to zero.
1181       if (i->src(t).mod.neg())
1182          ccZ = reverseCondCode(ccZ);
1183       // If this is a signed comparison, we expect the input to be a regular
1184       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1185       // is positive, so just flip the sign.
1186       if (i->sType == TYPE_S32) {
1187          assert(!isFloatType(si->dType));
1188          ccZ = reverseCondCode(ccZ);
1189       }
1190       switch (ccZ) {
1191       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
1192       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
1193       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
1194       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
1195       case CC_GT: break; // bool > 0 -- bool
1196       case CC_NE: break; // bool != 0 -- bool
1197       default:
1198          return;
1199       }
1200 
1201       // Update the condition of this SET to be identical to the origin set,
1202       // but with the updated condition code. The original SET should get
1203       // DCE'd, ideally.
1204       i->op = si->op;
1205       i->asCmp()->setCond = cc;
1206       i->setSrc(0, si->src(0));
1207       i->setSrc(1, si->src(1));
1208       if (si->srcExists(2))
1209          i->setSrc(2, si->src(2));
1210       i->sType = si->sType;
1211    }
1212       break;
1213 
1214    case OP_AND:
1215    {
1216       Instruction *src = i->getSrc(t)->getInsn();
1217       ImmediateValue imm1;
1218       if (imm0.reg.data.u32 == 0) {
1219          i->op = OP_MOV;
1220          i->setSrc(0, new_ImmediateValue(prog, 0u));
1221          i->src(0).mod = Modifier(0);
1222          i->setSrc(1, NULL);
1223       } else if (imm0.reg.data.u32 == ~0U) {
1224          i->op = i->src(t).mod.getOp();
1225          if (t) {
1226             i->setSrc(0, i->getSrc(t));
1227             i->src(0).mod = i->src(t).mod;
1228          }
1229          i->setSrc(1, NULL);
1230       } else if (src->asCmp()) {
1231          CmpInstruction *cmp = src->asCmp();
1232          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
1233             return;
1234          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
1235             return;
1236          if (imm0.reg.data.f32 != 1.0)
1237             return;
1238          if (cmp->dType != TYPE_U32)
1239             return;
1240 
1241          cmp->dType = TYPE_F32;
1242          if (i->src(t).mod != Modifier(0)) {
1243             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
1244             i->src(t).mod = Modifier(0);
1245             cmp->setCond = inverseCondCode(cmp->setCond);
1246          }
1247          i->op = OP_MOV;
1248          i->setSrc(s, NULL);
1249          if (t) {
1250             i->setSrc(0, i->getSrc(t));
1251             i->setSrc(t, NULL);
1252          }
1253       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
1254                  src->op == OP_SHR &&
1255                  src->src(1).getImmediate(imm1) &&
1256                  i->src(t).mod == Modifier(0) &&
1257                  util_is_power_of_two(imm0.reg.data.u32 + 1)) {
1258          // low byte = offset, high byte = width
1259          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
1260          i->op = OP_EXTBF;
1261          i->setSrc(0, src->getSrc(0));
1262          i->setSrc(1, new_ImmediateValue(prog, ext));
1263       } else if (src->op == OP_SHL &&
1264                  src->src(1).getImmediate(imm1) &&
1265                  i->src(t).mod == Modifier(0) &&
1266                  util_is_power_of_two(~imm0.reg.data.u32 + 1) &&
1267                  util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
1268          i->op = OP_MOV;
1269          i->setSrc(s, NULL);
1270          if (t) {
1271             i->setSrc(0, i->getSrc(t));
1272             i->setSrc(t, NULL);
1273          }
1274       }
1275    }
1276       break;
1277 
1278    case OP_SHL:
1279    {
1280       if (s != 1 || i->src(0).mod != Modifier(0))
1281          break;
1282       // try to concatenate shifts
1283       Instruction *si = i->getSrc(0)->getInsn();
1284       if (!si)
1285          break;
1286       ImmediateValue imm1;
1287       switch (si->op) {
1288       case OP_SHL:
1289          if (si->src(1).getImmediate(imm1)) {
1290             bld.setPosition(i, false);
1291             i->setSrc(0, si->getSrc(0));
1292             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
1293          }
1294          break;
1295       case OP_SHR:
1296          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
1297             bld.setPosition(i, false);
1298             i->op = OP_AND;
1299             i->setSrc(0, si->getSrc(0));
1300             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
1301          }
1302          break;
1303       case OP_MUL:
1304          int muls;
1305          if (isFloatType(si->dType))
1306             return;
1307          if (si->src(1).getImmediate(imm1))
1308             muls = 1;
1309          else if (si->src(0).getImmediate(imm1))
1310             muls = 0;
1311          else
1312             return;
1313 
1314          bld.setPosition(i, false);
1315          i->op = OP_MUL;
1316          i->setSrc(0, si->getSrc(!muls));
1317          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1318          break;
1319       case OP_SUB:
1320       case OP_ADD:
1321          int adds;
1322          if (isFloatType(si->dType))
1323             return;
1324          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
1325             adds = 0;
1326          else if (si->src(1).getImmediate(imm1))
1327             adds = 1;
1328          else
1329             return;
1330          if (si->src(!adds).mod != Modifier(0))
1331             return;
1332          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1333 
1334          // This is more operations, but if one of x, y is an immediate, then
1335          // we can get a situation where (a) we can use ISCADD, or (b)
1336          // propagate the add bit into an indirect load.
1337          bld.setPosition(i, false);
1338          i->op = si->op;
1339          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1340          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
1341                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
1342                                      si->getSrc(!adds),
1343                                      bld.mkImm(imm0.reg.data.u32)));
1344          break;
1345       default:
1346          return;
1347       }
1348    }
1349       break;
1350 
1351    case OP_ABS:
1352    case OP_NEG:
1353    case OP_SAT:
1354    case OP_LG2:
1355    case OP_RCP:
1356    case OP_SQRT:
1357    case OP_RSQ:
1358    case OP_PRESIN:
1359    case OP_SIN:
1360    case OP_COS:
1361    case OP_PREEX2:
1362    case OP_EX2:
1363       unary(i, imm0);
1364       break;
1365    case OP_BFIND: {
1366       int32_t res;
1367       switch (i->dType) {
1368       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1369       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1370       default:
1371          return;
1372       }
1373       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1374          res = 31 - res;
1375       bld.setPosition(i, false); /* make sure bld is init'ed */
1376       i->setSrc(0, bld.mkImm(res));
1377       i->setSrc(1, NULL);
1378       i->op = OP_MOV;
1379       i->subOp = 0;
1380       break;
1381    }
1382    case OP_POPCNT: {
1383       // Only deal with 1-arg POPCNT here
1384       if (i->srcExists(1))
1385          break;
1386       uint32_t res = util_bitcount(imm0.reg.data.u32);
1387       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1388       i->setSrc(1, NULL);
1389       i->op = OP_MOV;
1390       break;
1391    }
1392    case OP_CVT: {
1393       Storage res;
1394 
1395       // TODO: handle 64-bit values properly
1396       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1397          return;
1398 
1399       // TODO: handle single byte/word extractions
1400       if (i->subOp)
1401          return;
1402 
1403       bld.setPosition(i, true); /* make sure bld is init'ed */
1404 
1405 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1406    case type: \
1407       switch (i->sType) { \
1408       case TYPE_F64: \
1409          res.data.dst = util_iround(i->saturate ? \
1410                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1411                                     imm0.reg.data.f64); \
1412          break; \
1413       case TYPE_F32: \
1414          res.data.dst = util_iround(i->saturate ? \
1415                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1416                                     imm0.reg.data.f32); \
1417          break; \
1418       case TYPE_S32: \
1419          res.data.dst = i->saturate ? \
1420                         CLAMP(imm0.reg.data.s32, imin, imax) : \
1421                         imm0.reg.data.s32; \
1422          break; \
1423       case TYPE_U32: \
1424          res.data.dst = i->saturate ? \
1425                         CLAMP(imm0.reg.data.u32, umin, umax) : \
1426                         imm0.reg.data.u32; \
1427          break; \
1428       case TYPE_S16: \
1429          res.data.dst = i->saturate ? \
1430                         CLAMP(imm0.reg.data.s16, imin, imax) : \
1431                         imm0.reg.data.s16; \
1432          break; \
1433       case TYPE_U16: \
1434          res.data.dst = i->saturate ? \
1435                         CLAMP(imm0.reg.data.u16, umin, umax) : \
1436                         imm0.reg.data.u16; \
1437          break; \
1438       default: return; \
1439       } \
1440       i->setSrc(0, bld.mkImm(res.data.dst)); \
1441       break
1442 
1443       switch(i->dType) {
1444       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1445       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1446       CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1447       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1448       case TYPE_F32:
1449          switch (i->sType) {
1450          case TYPE_F64:
1451             res.data.f32 = i->saturate ?
1452                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1453                imm0.reg.data.f64;
1454             break;
1455          case TYPE_F32:
1456             res.data.f32 = i->saturate ?
1457                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1458                imm0.reg.data.f32;
1459             break;
1460          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1461          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1462          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1463          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1464          default:
1465             return;
1466          }
1467          i->setSrc(0, bld.mkImm(res.data.f32));
1468          break;
1469       case TYPE_F64:
1470          switch (i->sType) {
1471          case TYPE_F64:
1472             res.data.f64 = i->saturate ?
1473                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1474                imm0.reg.data.f64;
1475             break;
1476          case TYPE_F32:
1477             res.data.f64 = i->saturate ?
1478                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1479                imm0.reg.data.f32;
1480             break;
1481          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1482          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1483          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1484          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1485          default:
1486             return;
1487          }
1488          i->setSrc(0, bld.mkImm(res.data.f64));
1489          break;
1490       default:
1491          return;
1492       }
1493 #undef CASE
1494 
1495       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1496       i->op = OP_MOV;
1497       i->saturate = 0;
1498       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1499       break;
1500    }
1501    default:
1502       return;
1503    }
1504    if (newi->op != op)
1505       foldCount++;
1506 }
1507 
1508 // =============================================================================
1509 
1510 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   // Visits each instruction of the block, folding producer ops
   // (ABS/NEG/NOT) into source modifiers and SAT into the producer.
   virtual bool visit(BasicBlock *);
};
1516 
bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   // Iterate via 'next' so deleting i (in the SAT case) stays safe.
   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      // Deliberately disabled (if (0 && ...)): kept as a reference for a
      // possible SUB -> ADD+NEG canonicalization.
      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      // Try to fold each source's defining instruction into a modifier.
      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         // Skip predicated producers and heavily shared results.
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            // U32 consumer of an S32 producer: only allowed for
            // ADD/MUL consuming ABS/NEG.
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         // Producer op must map to a modifier (ABS/NEG/NOT), else skip.
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         // Compose with any modifier already on the producer's source.
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         // Only commit the fold if the target accepts this modifier here.
         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      // Fold SAT into its producer when the result is single-use and the
      // producer supports the saturate flag.
      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}
1587 
1588 // =============================================================================
1589 
1590 // MUL + ADD -> MAD/FMA
1591 // MIN/MAX(a, a) -> a, etc.
1592 // SLCT(a, b, const) -> cc(const) ? a : b
1593 // RCP(RCP(a)) -> a
1594 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Per-opcode rewrite helpers; handleADD returns whether it changed
   // the instruction (via tryADDToMADOrSAD).
   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT_NEG(Instruction *);
   void handleCVT_CVT(Instruction *);
   void handleCVT_EXTBF(Instruction *);
   void handleSUCLAMP(Instruction *);
   void handleNEG(Instruction *);

   // Used by the helpers to build replacement instructions in place.
   BuildUtil bld;
};
1615 
1616 void
handleABS(Instruction * abs)1617 AlgebraicOpt::handleABS(Instruction *abs)
1618 {
1619    Instruction *sub = abs->getSrc(0)->getInsn();
1620    DataType ty;
1621    if (!sub ||
1622        !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
1623       return;
1624    // expect not to have mods yet, if we do, bail
1625    if (sub->src(0).mod || sub->src(1).mod)
1626       return;
1627    // hidden conversion ?
1628    ty = intTypeToSigned(sub->dType);
1629    if (abs->dType != abs->sType || ty != abs->sType)
1630       return;
1631 
1632    if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
1633        sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
1634        sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
1635          return;
1636 
1637    Value *src0 = sub->getSrc(0);
1638    Value *src1 = sub->getSrc(1);
1639 
1640    if (sub->op == OP_ADD) {
1641       Instruction *neg = sub->getSrc(1)->getInsn();
1642       if (neg && neg->op != OP_NEG) {
1643          neg = sub->getSrc(0)->getInsn();
1644          src0 = sub->getSrc(1);
1645       }
1646       if (!neg || neg->op != OP_NEG ||
1647           neg->dType != neg->sType || neg->sType != ty)
1648          return;
1649       src1 = neg->getSrc(0);
1650    }
1651 
1652    // found ABS(SUB))
1653    abs->moveSources(1, 2); // move sources >=1 up by 2
1654    abs->op = OP_SAD;
1655    abs->setType(sub->dType);
1656    abs->setSrc(0, src0);
1657    abs->setSrc(1, src1);
1658    bld.setPosition(abs, false);
1659    abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
1660 }
1661 
1662 bool
handleADD(Instruction * add)1663 AlgebraicOpt::handleADD(Instruction *add)
1664 {
1665    Value *src0 = add->getSrc(0);
1666    Value *src1 = add->getSrc(1);
1667 
1668    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1669       return false;
1670 
1671    bool changed = false;
1672    if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1673       changed = tryADDToMADOrSAD(add, OP_MAD);
1674    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1675       changed = tryADDToMADOrSAD(add, OP_SAD);
1676    return changed;
1677 }
1678 
1679 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1680 // ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   // Producer op we are looking for: MUL feeds MAD, SAD feeds SAD.
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   // Modifier bits that block the merge: for MAD only NEG is tolerated,
   // for SAD no modifier is.
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   // Pick the operand that is the sole use of a matching producer.
   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   // Only merge with a producer in the same basic block.
   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   // These producer flags cannot be carried over into the fused op.
   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
       src->getInsn()->dnz)
      return false;

   if (toOp == OP_SAD) {
      // SAD merge only works if the existing accumulator is exactly 0.
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   // Types must agree in size and float-ness between ADD and producer.
   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   // mod[0..1]: modifiers on the ADD's sources;
   // mod[2..3]: modifiers on the producer's sources.
   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   // The other ADD operand becomes the accumulator (source 2).
   add->setSrc(2, add->src(s ? 0 : 1));

   // Sources 0/1 come from the producer; fold the ADD-side NEG (mod[s])
   // into source 0 via xor.
   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}
1745 
1746 void
handleMINMAX(Instruction * minmax)1747 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1748 {
1749    Value *src0 = minmax->getSrc(0);
1750    Value *src1 = minmax->getSrc(1);
1751 
1752    if (src0 != src1 || src0->reg.file != FILE_GPR)
1753       return;
1754    if (minmax->src(0).mod == minmax->src(1).mod) {
1755       if (minmax->def(0).mayReplace(minmax->src(0))) {
1756          minmax->def(0).replace(minmax->src(0), false);
1757          minmax->bb->remove(minmax);
1758       } else {
1759          minmax->op = OP_CVT;
1760          minmax->setSrc(1, NULL);
1761       }
1762    } else {
1763       // TODO:
1764       // min(x, -x) = -abs(x)
1765       // min(x, -abs(x)) = -abs(x)
1766       // min(x, abs(x)) = x
1767       // max(x, -abs(x)) = x
1768       // max(x, abs(x)) = abs(x)
1769       // max(x, -x) = abs(x)
1770    }
1771 }
1772 
1773 void
handleRCP(Instruction * rcp)1774 AlgebraicOpt::handleRCP(Instruction *rcp)
1775 {
1776    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1777 
1778    if (si && si->op == OP_RCP) {
1779       Modifier mod = rcp->src(0).mod * si->src(0).mod;
1780       rcp->op = mod.getOp();
1781       rcp->setSrc(0, si->getSrc(0));
1782    }
1783 }
1784 
1785 void
handleSLCT(Instruction * slct)1786 AlgebraicOpt::handleSLCT(Instruction *slct)
1787 {
1788    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1789       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1790          slct->setSrc(0, slct->getSrc(1));
1791    } else
1792    if (slct->getSrc(0) != slct->getSrc(1)) {
1793       return;
1794    }
1795    slct->op = OP_MOV;
1796    slct->setSrc(1, NULL);
1797    slct->setSrc(2, NULL);
1798 }
1799 
// Simplify bitwise logic ops on GPRs:
//   AND/OR(a, a) -> a, and AND/OR/XOR(SET, SET) -> a fused SET_<op> pair.
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      // AND(a, a) = OR(a, a) = a; XOR(a, a) is not handled here
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      // normalize so that set1 is the plain OP_SET (the one to be fused)
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      // set0 may itself already be part of a fused SET chain
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      // only profitable if at least one of the SET results dies here
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      // clone both SETs right after the logop (set0 first, then set1
      // before it, so execution order is set0 -> set1)
      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      // set0 now produces a predicate consumed by set1's combine slot (src 2)
      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
1863 
// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
//  F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   // only the exact F32 -> S32 conversion without a source modifier qualifies
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   // the conversion source must be an unmodified f32 negation
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      // nv50 path: peel I2F(ABS(SET)) down to the underlying SET
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   // replace the whole chain by a boolean (0 / -1) SET writing cvt's def
   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}
1902 
1903 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
1904 // does a type conversion, this becomes trickier as there might be range
1905 // changes/etc. We could handle those in theory as long as the range was being
1906 // reduced or kept the same.
1907 void
handleCVT_CVT(Instruction * cvt)1908 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
1909 {
1910    Instruction *insn = cvt->getSrc(0)->getInsn();
1911    RoundMode rnd = insn->rnd;
1912 
1913    if (insn->saturate ||
1914        insn->subOp ||
1915        insn->dType != insn->sType ||
1916        insn->dType != cvt->sType)
1917       return;
1918 
1919    switch (insn->op) {
1920    case OP_CEIL:
1921       rnd = ROUND_PI;
1922       break;
1923    case OP_FLOOR:
1924       rnd = ROUND_MI;
1925       break;
1926    case OP_TRUNC:
1927       rnd = ROUND_ZI;
1928       break;
1929    case OP_CVT:
1930       break;
1931    default:
1932       return;
1933    }
1934 
1935    if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
1936       rnd = (RoundMode)(rnd & 3);
1937 
1938    cvt->rnd = rnd;
1939    cvt->setSrc(0, insn->getSrc(0));
1940    cvt->src(0).mod *= insn->src(0).mod;
1941    cvt->sType = insn->sType;
1942 }
1943 
// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;
   unsigned width, offset;
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      // EXTBF immediate encodes width in bits 8..15, offset in bits 0..7
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      // only byte/word extractions at byte/word-aligned offsets qualify
      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      // AND with a 0xff/0xffff mask, possibly applied to a SHR result
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      // fold an aligned right-shift below the mask into the extract offset
      Instruction *shift = arg->getInsn();
      offset = 0;
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      // a bare SHR by 24/16 extracts the top byte/word
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   // none of the patterns matched
   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   // narrow the CVT's source type and encode the byte offset in subOp
   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}
2044 
2045 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
2046 void
handleSUCLAMP(Instruction * insn)2047 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
2048 {
2049    ImmediateValue imm;
2050    int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
2051    int s;
2052    Instruction *add;
2053 
2054    assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
2055 
2056    // look for ADD (TODO: only count references by non-SUCLAMP)
2057    if (insn->getSrc(0)->refCount() > 1)
2058       return;
2059    add = insn->getSrc(0)->getInsn();
2060    if (!add || add->op != OP_ADD ||
2061        (add->dType != TYPE_U32 &&
2062         add->dType != TYPE_S32))
2063       return;
2064 
2065    // look for immediate
2066    for (s = 0; s < 2; ++s)
2067       if (add->src(s).getImmediate(imm))
2068          break;
2069    if (s >= 2)
2070       return;
2071    s = s ? 0 : 1;
2072    // determine if immediate fits
2073    val += imm.reg.data.s32;
2074    if (val > 31 || val < -32)
2075       return;
2076    // determine if other addend fits
2077    if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
2078       return;
2079 
2080    bld.setPosition(insn, false); // make sure bld is init'ed
2081    // replace sources
2082    insn->setSrc(2, bld.mkImm(val));
2083    insn->setSrc(0, add->getSrc(s));
2084 }
2085 
2086 // NEG(AND(SET, 1)) -> SET
2087 void
handleNEG(Instruction * i)2088 AlgebraicOpt::handleNEG(Instruction *i) {
2089    Instruction *src = i->getSrc(0)->getInsn();
2090    ImmediateValue imm;
2091    int b;
2092 
2093    if (isFloatType(i->sType) || !src || src->op != OP_AND)
2094       return;
2095 
2096    if (src->src(0).getImmediate(imm))
2097       b = 1;
2098    else if (src->src(1).getImmediate(imm))
2099       b = 0;
2100    else
2101       return;
2102 
2103    if (!imm.isInteger(1))
2104       return;
2105 
2106    Instruction *set = src->getSrc(b)->getInsn();
2107    if ((set->op == OP_SET || set->op == OP_SET_AND ||
2108        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
2109        !isFloatType(set->dType)) {
2110       i->def(0).replace(set->getDef(0), false);
2111    }
2112 }
2113 
2114 bool
visit(BasicBlock * bb)2115 AlgebraicOpt::visit(BasicBlock *bb)
2116 {
2117    Instruction *next;
2118    for (Instruction *i = bb->getEntry(); i; i = next) {
2119       next = i->next;
2120       switch (i->op) {
2121       case OP_ABS:
2122          handleABS(i);
2123          break;
2124       case OP_ADD:
2125          handleADD(i);
2126          break;
2127       case OP_RCP:
2128          handleRCP(i);
2129          break;
2130       case OP_MIN:
2131       case OP_MAX:
2132          handleMINMAX(i);
2133          break;
2134       case OP_SLCT:
2135          handleSLCT(i);
2136          break;
2137       case OP_AND:
2138       case OP_OR:
2139       case OP_XOR:
2140          handleLOGOP(i);
2141          break;
2142       case OP_CVT:
2143          handleCVT_NEG(i);
2144          handleCVT_CVT(i);
2145          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2146              handleCVT_EXTBF(i);
2147          break;
2148       case OP_SUCLAMP:
2149          handleSUCLAMP(i);
2150          break;
2151       case OP_NEG:
2152          handleNEG(i);
2153          break;
2154       default:
2155          break;
2156       }
2157    }
2158 
2159    return true;
2160 }
2161 
2162 // =============================================================================
2163 
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
class LateAlgebraicOpt : public Pass
{
private:
   virtual bool visit(Instruction *);   // per-instruction dispatcher

   void handleADD(Instruction *);       // filters OP_ADD candidates
   bool tryADDToSHLADD(Instruction *);  // true if the ADD was rewritten
};
2173 
2174 void
handleADD(Instruction * add)2175 LateAlgebraicOpt::handleADD(Instruction *add)
2176 {
2177    Value *src0 = add->getSrc(0);
2178    Value *src1 = add->getSrc(1);
2179 
2180    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2181       return;
2182 
2183    if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
2184       tryADDToSHLADD(add);
2185 }
2186 
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   ImmediateValue imm;
   Instruction *shl;
   Value *src;
   int s;

   // no saturation/flags, no 64-bit or float adds
   if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
       || isFloatType(add->dType))
      return false;

   // find which source is produced by a SHL
   if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
      s = 0;
   else
   if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
      s = 1;
   else
      return false;

   src = add->getSrc(s);
   shl = src->getUniqueInsn();

   // the SHL must be a plain same-BB shift without flags/subOp/modifiers
   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
      return false;

   // the shift amount must be an immediate (SHLADD's middle operand)
   if (!shl->src(1).getImmediate(imm))
      return false;

   // rewrite in place: dst = (a << imm) + c
   add->op = OP_SHLADD;
   add->setSrc(2, add->src(!s));
   // SHL can't have any modifiers, but the ADD source may have had
   // one. Preserve it.
   add->setSrc(0, shl->getSrc(0));
   if (s == 1)
      add->src(0).mod = add->src(1).mod;
   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
   add->src(1).mod = Modifier(0);

   return true;
}
2231 
2232 bool
visit(Instruction * i)2233 LateAlgebraicOpt::visit(Instruction *i)
2234 {
2235    switch (i->op) {
2236    case OP_ADD:
2237       handleADD(i);
2238       break;
2239    default:
2240       break;
2241    }
2242 
2243    return true;
2244 }
2245 
2246 // =============================================================================
2247 
2248 static inline void
updateLdStOffset(Instruction * ldst,int32_t offset,Function * fn)2249 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2250 {
2251    if (offset != ldst->getSrc(0)->reg.data.offset) {
2252       if (ldst->getSrc(0)->refCount() > 1)
2253          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2254       ldst->getSrc(0)->reg.data.offset = offset;
2255    }
2256 }
2257 
// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // One tracked memory access, kept in a doubly linked list per data file.
   class Record
   {
   public:
      Record *next;
      Instruction *insn;    // the load/store this record describes
      const Value *rel[2];  // indirect address components, if any
      const Value *base;    // base symbol of the address
      int32_t offset;       // byte offset of the access
      int8_t fileIndex;     // sub-index within the file (reg.fileIndex)
      uint8_t size;         // access size in bytes
      bool locked;          // store may no longer be eliminated or merged
      Record *prev;

      // conservative may-alias test against another memory instruction
      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];   // per-file list of still-valid loads
   Record *stores[DATA_FILE_COUNT];  // per-file list of still-valid stores

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   // pick loads[] or stores[] depending on the instruction's opcode/file
   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};
2314 
MemoryOpt()2315 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2316 {
2317    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2318       loads[i] = NULL;
2319       stores[i] = NULL;
2320    }
2321    prevRecord = NULL;
2322 }
2323 
2324 void
reset()2325 MemoryOpt::reset()
2326 {
2327    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2328       Record *it, *next;
2329       for (it = loads[i]; it; it = next) {
2330          next = it->next;
2331          recordPool.release(it);
2332       }
2333       loads[i] = NULL;
2334       for (it = stores[i]; it; it = next) {
2335          next = it->next;
2336          recordPool.release(it);
2337       }
2338       stores[i] = NULL;
2339    }
2340 }
2341 
// Merge load @ld into the adjacent recorded load @rec->insn, widening the
// recorded load to cover both and moving @ld's definitions over.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // count definitions in the recorded load (j ends as that count)
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      // @ld covers the lower part: shift the record's defs up to make room
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      // lower the recorded load's address to @ld's offset (clone the
      // address symbol if it is shared)
      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      // @ld covers the upper part: its defs are appended after the record's
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   // widen the recorded access to the combined size
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
2397 
// Merge store @st with the adjacent recorded store @rec->insn into a single
// wider store, keeping the sources in ascending address order.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // record covers the lower addresses: its values go first
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      // @st covers the lower addresses: append the record's values after
      // @st's existing ones (j skips past them first)
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   // @st survives as the combined store; drop the old recorded one
   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}
2457 
2458 void
set(const Instruction * ldst)2459 MemoryOpt::Record::set(const Instruction *ldst)
2460 {
2461    const Symbol *mem = ldst->getSrc(0)->asSym();
2462    fileIndex = mem->reg.fileIndex;
2463    rel[0] = ldst->getIndirect(0, 0);
2464    rel[1] = ldst->getIndirect(0, 1);
2465    offset = mem->reg.data.offset;
2466    base = mem->getBase();
2467    size = typeSizeof(ldst->sType);
2468 }
2469 
2470 void
link(Record ** list)2471 MemoryOpt::Record::link(Record **list)
2472 {
2473    next = *list;
2474    if (next)
2475       next->prev = this;
2476    prev = NULL;
2477    *list = this;
2478 }
2479 
2480 void
unlink(Record ** list)2481 MemoryOpt::Record::unlink(Record **list)
2482 {
2483    if (next)
2484       next->prev = prev;
2485    if (prev)
2486       prev->next = next;
2487    else
2488       *list = next;
2489 }
2490 
2491 MemoryOpt::Record **
getList(const Instruction * insn)2492 MemoryOpt::getList(const Instruction *insn)
2493 {
2494    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2495       return &loads[insn->src(0).getFile()];
2496    return &stores[insn->src(0).getFile()];
2497 }
2498 
2499 void
addRecord(Instruction * i)2500 MemoryOpt::addRecord(Instruction *i)
2501 {
2502    Record **list = getList(i);
2503    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2504 
2505    it->link(list);
2506    it->set(i);
2507    it->insn = i;
2508    it->locked = false;
2509 }
2510 
// Find a recorded access that @insn can be merged with or replaced by.
// @isAdj is set to whether the two accesses are adjacent (mergeable) as
// opposed to overlapping (replaceable); a non-adjacent exact match is
// returned immediately, an adjacent candidate only as a fallback.
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      // locked stores may still satisfy loads but not be combined with stores
      if (it->locked && insn->op != OP_LOAD)
         continue;
      // must be in the same 16-byte chunk with identical indirects/file index
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            // adjacency merge only from an 8-byte-aligned record
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         // same offset and the record fully covers @insn: direct hit
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}
2548 
// Forward a previous store's source values to the definitions of load @ld
// (store-to-load forwarding); the load is removed on success.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   // walk the store's sources until we reach the load's offset
   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   // forward component-wise; sizes must match exactly and only GPR
   // sources can replace load results
   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}
2572 
// Replace load @ldE by reusing the results of an earlier recorded load
// (@rec->insn) that covers the same location; @ldE is deleted on success.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   // advance through the earlier load's defs up to @ldE's offset
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   // reuse component-wise; sizes must match exactly
   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}
2596 
// Let store @st supersede the overlapping recorded store @rec->insn: the
// values of @st win where the ranges overlap, and any parts of the old
// store outside @st's range are folded into @st. The old store is deleted.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   // combined range of both stores
   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      // old store starts lower: rebuild @st's source list in address order
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      // old store extends past @st: append its trailing values
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}
2651 
2652 bool
overlaps(const Instruction * ldst) const2653 MemoryOpt::Record::overlaps(const Instruction *ldst) const
2654 {
2655    Record that;
2656    that.set(ldst);
2657 
2658    if (this->fileIndex != that.fileIndex)
2659       return false;
2660 
2661    if (this->rel[0] || that.rel[0])
2662       return this->base == that.base;
2663    return
2664       (this->offset < that.offset + that.size) &&
2665       (this->offset + this->size > that.offset);
2666 }
2667 
2668 // We must not eliminate stores that affect the result of @ld if
2669 // we find later stores to the same location, and we may no longer
2670 // merge them with later stores.
2671 // The stored value can, however, still be used to determine the value
2672 // returned by future loads.
2673 void
lockStores(Instruction * const ld)2674 MemoryOpt::lockStores(Instruction *const ld)
2675 {
2676    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
2677       if (!r->locked && r->overlaps(ld))
2678          r->locked = true;
2679 }
2680 
2681 // Prior loads from the location of @st are no longer valid.
2682 // Stores to the location of @st may no longer be used to derive
2683 // the value at it nor be coalesced into later stores.
2684 void
purgeRecords(Instruction * const st,DataFile f)2685 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
2686 {
2687    if (st)
2688       f = st->src(0).getFile();
2689 
2690    for (Record *r = loads[f]; r; r = r->next)
2691       if (!st || r->overlaps(st))
2692          r->unlink(&loads[f]);
2693 
2694    for (Record *r = stores[f]; r; r = r->next)
2695       if (!st || r->overlaps(st))
2696          r->unlink(&stores[f]);
2697 }
2698 
2699 bool
visit(BasicBlock * bb)2700 MemoryOpt::visit(BasicBlock *bb)
2701 {
2702    bool ret = runOpt(bb);
2703    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
2704    // where 96 bit memory operations are forbidden.
2705    if (ret)
2706       ret = runOpt(bb);
2707    return ret;
2708 }
2709 
// One forward pass over @bb: eliminate dead/undef-value accesses, forward
// stores to loads, reuse/merge loads, merge/supersede stores, and treat
// barriers/atomics as record-purging fences.
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         // drop stores of undefined values (GPR defined by OP_NOP)
         // NOTE(review): getSrc(1)->getInsn() is assumed non-NULL here for
         // GPR sources — verify values in SSA always have a defining insn
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // not a plain load/store: possibly a fence that invalidates records
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         // a surviving load pins overlapping earlier stores in place
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         // a surviving store invalidates overlapping records
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}
2806 
2807 // =============================================================================
2808 
2809 // Turn control flow into predicated instructions (after register allocation !).
2810 // TODO:
2811 // Could move this to before register allocation on NVC0 and also handle nested
2812 // constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   // If bb ends a simple if(/else), predicate the conditional block(s) and
   // remove the branch/join instructions; returns true on success.
   bool tryPredicateConditional(BasicBlock *);
   // Predicate every non-nop instruction of bb with (cc, pred) and drop
   // the block's terminating flow instruction.
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   // Replace a branch to a single-instruction BRA/JOIN/EXIT block by that
   // instruction itself.
   void tryPropagateBranch(BasicBlock *);
   // Whether pred is a comparison of immediate / constant-buffer values only.
   inline bool isConstantCondition(Value *pred);
   // Whether insn can safely execute under predicate pred.
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   // Delete a removable BRA/JOIN and clean up its now-unused predicate.
   inline void removeFlow(Instruction *);

   // log2 of the GPR allocation unit size in bytes (used as a shift amount
   // in isConstantCondition); cached per function in visit(Function *).
   uint8_t gpr_unit;
};
2828 
2829 bool
isConstantCondition(Value * pred)2830 FlatteningPass::isConstantCondition(Value *pred)
2831 {
2832    Instruction *insn = pred->getUniqueInsn();
2833    assert(insn);
2834    if (insn->op != OP_SET || insn->srcExists(2))
2835       return false;
2836 
2837    for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
2838       Instruction *ld = insn->getSrc(s)->getUniqueInsn();
2839       DataFile file;
2840       if (ld) {
2841          if (ld->op != OP_MOV && ld->op != OP_LOAD)
2842             return false;
2843          if (ld->src(0).isIndirect(0))
2844             return false;
2845          file = ld->src(0).getFile();
2846       } else {
2847          file = insn->src(s).getFile();
2848          // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
2849          // in register "units", which can vary between targets.
2850          if (file == FILE_GPR) {
2851             Value *v = insn->getSrc(s);
2852             int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
2853             int units = bytes >> gpr_unit;
2854             if (units > prog->maxGPR)
2855                file = FILE_IMMEDIATE;
2856          }
2857       }
2858       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
2859          return false;
2860    }
2861    return true;
2862 }
2863 
// Delete a flow instruction (a safely removable BRA or a JOIN) and, if its
// predicate thereby becomes unused, deallocate the predicate register and
// delete the instruction that produced it.
void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return; // don't remove branches along cross/back edges
   } else
   if (term->op != OP_JOIN)
      return; // only BRA and JOIN are handled here

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      // NOTE(review): pSet is not NULL-checked before isDead() — presumably
      // a predicate value here always has a unique defining insn; confirm.
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}
2891 
2892 void
predicateInstructions(BasicBlock * bb,Value * pred,CondCode cc)2893 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
2894 {
2895    for (Instruction *i = bb->getEntry(); i; i = i->next) {
2896       if (i->isNop())
2897          continue;
2898       assert(!i->getPredicate());
2899       i->setPredicate(cc, pred);
2900    }
2901    removeFlow(bb->getExit());
2902 }
2903 
2904 bool
mayPredicate(const Instruction * insn,const Value * pred) const2905 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
2906 {
2907    if (insn->isPseudo())
2908       return true;
2909    // TODO: calls where we don't know which registers are modified
2910 
2911    if (!prog->getTarget()->mayPredicate(insn, pred))
2912       return false;
2913    for (int d = 0; insn->defExists(d); ++d)
2914       if (insn->getDef(d)->equals(pred))
2915          return false;
2916    return true;
2917 }
2918 
2919 // If we jump to BRA/RET/EXIT, replace the jump with it.
2920 // NOTE: We do not update the CFG anymore here !
2921 //
2922 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
2923 //  BB:0
2924 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
2925 //  BB1:
2926 //   bra BB:3
2927 //  BB2:
2928 //   ...
2929 //  BB3:
2930 //   ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   // walk backwards over the trailing (possibly predicated) branches
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      // only consider targets consisting of exactly one instruction
      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      // if we were the target's only predecessor it is now unreachable
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}
2960 
2961 bool
visit(Function * fn)2962 FlatteningPass::visit(Function *fn)
2963 {
2964    gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
2965 
2966    return true;
2967 }
2968 
bool
FlatteningPass::visit(BasicBlock *bb)
{
   // first try to turn the whole conditional into predicated code
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         // the join flag cannot be attached to flow instructions or to the
         // ops excluded below (known or suspected hardware restrictions)
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_DISCARD &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit()); // the separate JOIN is now redundant
            return true;
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}
3002 
// Try to flatten the simple conditional starting at bb: if the conditional
// block(s) are short and every instruction in them may be predicated,
// predicate them with the branch condition (resp. its inverse) and remove
// the branch/join instructions.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   // mask's low bits select which of the two successors are the bodies of
   // the conditional (see BasicBlock::initiatesSimpleConditional)
   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // a branch on a constant condition is cheap, so tolerate fewer
   // predicated instructions before preferring a real branch
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      // first outgoing edge (presumably the edge taken when the branch
      // condition cc holds — confirm against CFG edge ordering)
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      // second outgoing edge (the other side of the conditional)
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   // commit: predicate one side with cc, the other with !cc
   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}
3063 
3064 // =============================================================================
3065 
3066 // Fold Immediate into MAD; must be done after register allocation due to
3067 // constraint SDST == SSRC2
3068 // TODO:
3069 // Does NVC0+ have other situations where this pass makes sense?
class NV50PostRaConstantFolding : public Pass
{
private:
   // Rewrite eligible "mad $rX, $rY, $rZ, $rX" so that the multiplier is
   // an inline immediate (only legal once registers are known).
   virtual bool visit(BasicBlock *);
};
3075 
3076 static bool
post_ra_dead(Instruction * i)3077 post_ra_dead(Instruction *i)
3078 {
3079    for (int d = 0; i->defExists(d); ++d)
3080       if (i->getDef(d)->refCount())
3081          return false;
3082    return true;
3083 }
3084 
3085 bool
visit(BasicBlock * bb)3086 NV50PostRaConstantFolding::visit(BasicBlock *bb)
3087 {
3088    Value *vtmp;
3089    Instruction *def;
3090 
3091    for (Instruction *i = bb->getFirst(); i; i = i->next) {
3092       switch (i->op) {
3093       case OP_MAD:
3094          if (i->def(0).getFile() != FILE_GPR ||
3095              i->src(0).getFile() != FILE_GPR ||
3096              i->src(1).getFile() != FILE_GPR ||
3097              i->src(2).getFile() != FILE_GPR ||
3098              i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3099             break;
3100 
3101          if (i->getDef(0)->reg.data.id >= 64 ||
3102              i->getSrc(0)->reg.data.id >= 64)
3103             break;
3104 
3105          if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
3106             break;
3107 
3108          if (i->getPredicate())
3109             break;
3110 
3111          def = i->getSrc(1)->getInsn();
3112          if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
3113             def = def->getSrc(0)->getInsn();
3114          if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
3115             vtmp = i->getSrc(1);
3116             if (isFloatType(i->sType)) {
3117                i->setSrc(1, def->getSrc(0));
3118             } else {
3119                ImmediateValue val;
3120                bool ret = def->src(0).getImmediate(val);
3121                assert(ret);
3122                if (i->getSrc(1)->reg.data.id & 1)
3123                   val.reg.data.u32 >>= 16;
3124                val.reg.data.u32 &= 0xffff;
3125                i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
3126             }
3127 
3128             /* There's no post-RA dead code elimination, so do it here
3129              * XXX: if we add more code-removing post-RA passes, we might
3130              *      want to create a post-RA dead-code elim pass */
3131             if (post_ra_dead(vtmp->getInsn())) {
3132                Value *src = vtmp->getInsn()->getSrc(0);
3133                // Careful -- splits will have already been removed from the
3134                // functions. Don't double-delete.
3135                if (vtmp->getInsn()->bb)
3136                   delete_Instruction(prog, vtmp->getInsn());
3137                if (src->getInsn() && post_ra_dead(src->getInsn()))
3138                   delete_Instruction(prog, src->getInsn());
3139             }
3140 
3141             break;
3142          }
3143          break;
3144       default:
3145          break;
3146       }
3147    }
3148 
3149    return true;
3150 }
3151 
3152 // =============================================================================
3153 
3154 // Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Replace *ptr by i if both compute the same result; on success *ptr is
   // deleted and set to NULL.
   inline bool tryReplace(Instruction **, Instruction *);

   // instructions seen so far in the current block, bucketed by opcode
   DLList ops[OP_LAST + 1];
};
3164 
// Cross-block CSE: hoists identical single-use computations feeding a phi
// out of the predecessor blocks (see GlobalCSE::visit).
class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
3170 
// Whether this instruction performs the same operation as 'that': same
// opcode, types, condition code and all op-specific state. Sources and
// defs are compared separately (see isResultEqual).
bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   // compare the subclass-specific state
   if (this->asTex()) {
      // texture arguments must match bit for bit
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      // flow instructions are never considered equal
      return false;
   } else {
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   // generic modifier state shared by all instruction kinds
   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}
3213 
// Whether this instruction computes the same result(s) as 'that': equal
// action, equal predicate source, and pairwise equal defs and sources.
// Memory accesses only qualify when the accessed file cannot have been
// written in between.
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   // defs must match pairwise and be equally many
   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   // sources (including their modifiers) must match pairwise, too
   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
      // only files that the shader cannot write are safe to CSE
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      case FILE_SHADER_OUTPUT:
         // presumably TES reads its "outputs" as read-only inputs — confirm
         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
      default:
         return false;
      }
   }

   return true;
}
3262 
3263 // pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      // every phi source must be used by this phi only
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      // all remaining sources must be single-use results of instructions
      // computing exactly the same value as ik
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         // all sources are identical: move one copy of the computation into
         // this block (after a leading OP_JOIN, if any), make it define the
         // phi's destination directly, and delete the phi
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
3302 
3303 bool
tryReplace(Instruction ** ptr,Instruction * i)3304 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3305 {
3306    Instruction *old = *ptr;
3307 
3308    // TODO: maybe relax this later (causes trouble with OP_UNION)
3309    if (i->isPredicated())
3310       return false;
3311 
3312    if (!old->isResultEqual(i))
3313       return false;
3314 
3315    for (int d = 0; old->defExists(d); ++d)
3316       old->def(d).replace(i->getDef(d), false);
3317    delete_Instruction(prog, old);
3318    *ptr = NULL;
3319    return true;
3320 }
3321 
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   // repeat until a full sweep finds no more replacements
   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getFirst(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next; // ir may be deleted by tryReplace below

         // fixed instructions may serve as replacement targets but are
         // never removed themselves
         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         // pick the LValue source with the fewest uses: scanning its use
         // list is the cheapest way to find candidate duplicates
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // candidates are earlier users of src in the same block
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            // no LValue sources: compare against all previously seen
            // instructions with the same opcode
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         // tryReplace() clears ir on success
         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}
3382 
3383 // =============================================================================
3384 
3385 // Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   // Run dead-code elimination repeatedly until a fixed point is reached.
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   // number of instructions removed during the current run
   unsigned int deadCount;
};
3398 
3399 bool
buryAll(Program * prog)3400 DeadCodeElim::buryAll(Program *prog)
3401 {
3402    do {
3403       deadCount = 0;
3404       if (!this->run(prog, false, false))
3405          return false;
3406    } while (deadCount);
3407 
3408    return true;
3409 }
3410 
bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   // walk backwards so removing a use can expose dead producers within
   // the same sweep
   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         // multi-destination load: parts of it may be dead
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         // the result is unused, but the instruction has side effects and
         // must be kept; just drop the unused destination
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            i->setDef(0, NULL);
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            // keep def(1) (presumably the lock-status result — confirm)
            // as the sole destination
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}
3440 
3441 // Each load can go into up to 4 destinations, any of which might potentially
3442 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3443 // of where the holes are. We find the first contiguous region, put it into
3444 // the first load, and then put the second contiguous region into the second
3445 // load. There can be at most 2 contiguous regions.
3446 //
3447 // Note that there are some restrictions, for example it's not possible to do
3448 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3449 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3450 // split into a 64-bit and 32-bit load.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff; // one bit per def; cleared bits are holes

   // mark unused defs that did not receive a register as holes
   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return; // nothing dead, no need to split

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;

   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         // presumably: a start address that is not 8-byte aligned limits
         // the first load to a single word — confirm alignment rules
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         // leading hole: just advance the start address
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         // a hole after the first live region ends the first load
         break;
      }
   }

   // Scale back the size of the first load until it can be loaded. This
   // typically happens for TYPE_B96 loads.
   while (n1 &&
          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
                                                typeOfSize(size1))) {
      size1 -= def1[--n1]->reg.size;
      d--; // re-process the dropped def as part of the second load
   }

   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else if (!n2) {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Make sure that we've processed all the values
   for (; ld1->defExists(d); ++d)
      assert(!(mask & (1 << d)));

   // shrink the original load to cover only the first live region
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   // emit a second load covering the remaining live region
   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}
3530 
3531 // =============================================================================
3532 
// Instantiate optimization pass class `n' and invoke its member `f' on
// this program, but only if the requested optimization level is >= `l'.
// NOTE(review): expands to a bare if-statement (no do/while wrapper), so
// be careful when placing RUN_PASS inside an if/else of its own.
#define RUN_PASS(l, n, f)                       \
   if (level >= (l)) {                          \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
         INFO("PEEPHOLE: %s\n", #n);            \
      n pass;                                   \
      if (!pass.f(this))                        \
         return false;                          \
   }
3541 
// Run the SSA-level optimization pipeline; each pass executes only if
// `level' reaches its threshold (first RUN_PASS argument).
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}
3562 
// Run the post-register-allocation optimization passes.
bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   // the MAD immediate folding relies on pre-NVC0 encoding constraints
   if (getTarget()->getChipset() < 0xc0)
      RUN_PASS(2, NV50PostRaConstantFolding, run);

   return true;
}
3572 
3573 }
3574