/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))

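// Illustrative example (not in the original source): QUADOP(MOV2, ADD, MOV2, ADD)
// packs one 2-bit quad-op per lane in UL, UR, LL, LR order, i.e.
//   (QOP_MOV2 << 6) | (QOP_ADD << 4) | (QOP_MOV2 << 2) | (QOP_ADD << 0)
// = (3 << 6) | (0 << 4) | (3 << 2) | 0 = 0xcc.
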
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}
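
// A rough sketch (illustrative, not from the source) of what handleDIV emits
// for "d = a / b" with dType TYPE_U32:
//   mov $r0, a
//   mov $r1, b
//   call builtin NVC0_BUILTIN_DIV_U32   // quotient in $r0, remainder in $r1
//   mov d, $r0                          // $r1 instead for OP_MOD
// plus clobber markers for the GPRs and predicates the builtin overwrites.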

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}
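
// Illustrative before/after (not from the source): "rcp f64 $d, $s" becomes
//   split $s        -> (sLo, sHi)
//   rcp f32 dHi, sHi            // subOp RCPRSQ_64H works on the high words
//   merge $d        <- (0, dHi)
// so only the high 32 bits are actually computed; the low word is forced to 0.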

void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores indirect handle combined with array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   i->moveSources(arg + 1, -1);
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      default:
         break;
      }
   }
   return true;
}

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
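//
// Worked example (illustrative, not from the source): for the sequence
//    tex $r0, ... ; tex $r4, ... ; add ..., $r0, ...
// the use of $r0 has barrier level 1 (one other TEX is issued in between),
// so a "texbar 1" before the add waits for the first TEX only, while the
// second may still be in flight.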
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}
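
// E.g. (illustrative): "add $r2, $r1, 0" becomes "add $r2, $r1, $r63" ($r255
// on GK20A+, see the register ids assigned in visit() above), reading the
// designated zero register instead of encoding an immediate 0.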

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) > 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}
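
// Illustrative: for an indirect index t the handle is read from
// c[auxCBSlot][texBindBase + slot * 4 + t * 4]; the SHL by 2 above turns the
// register index into that byte offset.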

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
         i->tex.r = 0xff;
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   if (chipset >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}
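
// Illustrative note on the TXG offset packing above (not from the source):
// with 4 offsets, each component is one byte, so offsets (1,2),(3,4),(5,6),(7,8)
// end up as offs[0] = 0x04030201 and offs[1] = 0x08070605 -- the INSBF
// immediates 0x800 | pos insert an 8-bit field at bit position pos.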

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   unsigned array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
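
// The idea, roughly (illustrative): quads execute in lockstep, so for each
// lane l the coordinates are broadcast to all four lanes, dPdx/dPdy are added
// to the +x/+y lanes via the QUADOP patterns above, and one TEX is issued per
// lane; each lane's result is extracted with a lane-masked mov and the four
// copies are recombined with OP_UNION.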

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow())
      txd->op = OP_TEX;

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }

   // In this case we have fewer than 4 "real" arguments, which means that
   // handleTEX didn't apply any padding. However we have to make sure that
   // the second "group" of arguments still gets padded up to 4.
   if (chipset >= NVISA_GK104_CHIPSET) {
      int s = arg + 2 * dim;
      if (s >= 4 && s < 7) {
         if (txd->srcExists(s)) // move potential predicate out of the way
            txd->moveSources(s, 7 - s);
         while (s < 7)
            txd->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   const int chipset = prog->getTarget()->getChipset();
   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
      txq->tex.r += prog->driver->io.texBindBase / 4;

   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

   assert(ticRel);

   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r));

      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      txq->setIndirectR(NULL);
      txq->moveSources(0, 1);
      txq->setSrc(0, hnd);
      txq->tex.rIndirectSrc = 0;
   }

   return true;
}

bool
NVC0LoweringPass::handleTXLQ(TexInstruction *i)
{
   /* The outputs are inverted compared to what the TGSI instruction
    * expects. Take that into account in the mask.
    */
   assert((i->tex.mask & ~3) == 0);
   if (i->tex.mask == 1)
      i->tex.mask = 2;
   else if (i->tex.mask == 2)
      i->tex.mask = 1;
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s16/u16 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      enum DataType type = TYPE_S16;
      if (i->tex.mask == 2 || def > 0)
         type = TYPE_U16;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   if (i->tex.mask == 3) {
      LValue *t = new_LValue(func, FILE_GPR);
      bld.mkMov(t, i->getDef(0));
      bld.mkMov(i->getDef(0), i->getDef(1));
      bld.mkMov(i->getDef(1), t);
   }
   return true;
}
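
// Example (illustrative): the raw LOD values are fixed-point with 8 fractional
// bits, so a raw result of 0x180 becomes 384.0f after the CVT and
// 384.0f / 256 = 1.5f after the MUL above.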

bool
NVC0LoweringPass::handleBUFQ(Instruction *bufq)
{
   bufq->op = OP_MOV;
   bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
                                   bufq->getSrc(0)->reg.fileIndex * 16));
   bufq->setIndirect(0, 0, NULL);
   bufq->setIndirect(0, 1, NULL);
   return true;
}

void
NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0), bld.mkImm(1));

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
                         atom->getSrc(1));
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setDef(0, pred->getDef(0));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Loop back and retry until the store (with unlock) has been performed.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}
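
// Control-flow sketch of the lock loop built above (illustrative):
//   tryLock:      r = load locked [addr]        // predicate = lock acquired
//                 if (locked) goto setAndUnlock
//   setAndUnlock: store unlocked [addr], stVal  // predicate = store succeeded
//   failLock:     if (!stored) goto tryLock
//                 goto join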

void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockAndSetBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
      set->setPredicate(CC_P, ld->getDef(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(2), set->getDef(0));
      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
      selp->setPredicate(CC_P, ld->getDef(1));

      stVal = selp->getDef(0);
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));
      i->setPredicate(CC_P, ld->getDef(1));

      stVal = i->getDef(0);
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setPredicate(CC_P, ld->getDef(1));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   // Loop until the lock is acquired.
   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);

   bld.remove(atom);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;
   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
      // operations on shared memory. For Maxwell, ATOMS is enough.
      if (targ->getChipset() < NVISA_GK104_CHIPSET)
         handleSharedATOM(atom);
      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
         handleSharedATOMNVE4(atom);
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
      assert(base->reg.size == 8);
      atom->setIndirect(0, 0, base);
      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;

      // Harden against out-of-bounds accesses
      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (ptr)
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      atom->setPredicate(CC_NOT_P, pred);
      if (atom->defExists(0)) {
         Value *zero, *dst = atom->getDef(0);
         atom->setDef(0, bld.getSSA());

         bld.setPosition(atom, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
      }

      return true;
   }
   base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 1, NULL);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
         return false;
      }
   }

   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
      cas->setSrc(2, dreg);
   }

   return true;
}
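
// Illustrative: for "cas [addr], cmp, swap" this builds
//   merge $dreg(64-bit) <- (cmp, swap)
// and feeds $dreg as both the 2nd and 3rd source, so the compare/swap pair
// occupies the aligned register pair the hardware expects.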

inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
}

inline Value *
NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
 * They couldn't just have added an equivalent to TIC now, could they?
 */
#define NVC0_SU_INFO_ADDR   0x00
#define NVC0_SU_INFO_FMT    0x04
#define NVC0_SU_INFO_DIM_X  0x08
#define NVC0_SU_INFO_PITCH  0x0c
#define NVC0_SU_INFO_DIM_Y  0x10
#define NVC0_SU_INFO_ARRAY  0x14
#define NVC0_SU_INFO_DIM_Z  0x18
#define NVC0_SU_INFO_UNK1C  0x1c
#define NVC0_SU_INFO_WIDTH  0x20
#define NVC0_SU_INFO_HEIGHT 0x24
#define NVC0_SU_INFO_DEPTH  0x28
#define NVC0_SU_INFO_TARGET 0x2c
#define NVC0_SU_INFO_BSIZE  0x30
#define NVC0_SU_INFO_RAW_X  0x34
#define NVC0_SU_INFO_MS_X   0x38
#define NVC0_SU_INFO_MS_Y   0x3c

#define NVC0_SU_INFO__STRIDE 0x40

#define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)

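// The indexed macros above just re-derive the per-component fields, e.g.
// NVC0_SU_INFO_DIM(1) == 0x10 == NVC0_SU_INFO_DIM_Y and
// NVC0_SU_INFO_SIZE(2) == 0x28 == NVC0_SU_INFO_DEPTH, with one 0x40-byte
// record per surface slot (NVC0_SU_INFO__STRIDE).
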
inline Value *
NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off)
{
   uint32_t base = slot * NVC0_SU_INFO__STRIDE;

   if (ptr) {
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
      base = 0;
   }
   off += base;

   return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
}
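
// A sketch of the indirect case above with the IR ops folded into plain
// arithmetic (helper name is hypothetical): the slot is added to the dynamic
// index, wrapped to the 8 available bindings, and scaled by the 0x40-byte
// info record stride (the shift by 6).
static inline uint32_t
exampleSuInfoByteOffset(uint32_t dynamicIndex, int slot, uint32_t field)
{
   return (((dynamicIndex + slot) & 7) << 6) + field;
}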

static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
{
   switch (su->tex.target.getEnum()) {
   case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
                                   NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
                                   NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   default:
      assert(0);
      return 0;
   }
}

bool
NVC0LoweringPass::handleSUQ(TexInstruction *suq)
{
   int mask = suq->tex.mask;
   int dim = suq->tex.target.getDim();
   int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   Value *ind = suq->getIndirectR();
   int slot = suq->tex.r;
   int c, d;

   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
      if (c >= arg || !(mask & 1))
         continue;

      int offset;

      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
         offset = NVC0_SU_INFO_SIZE(2);
      } else {
         offset = NVC0_SU_INFO_SIZE(c);
      }
      bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset));
      if (c == 2 && suq->tex.target.isCube())
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
                   bld.loadImm(NULL, 6));
   }

   if (mask & 1) {
      if (suq->tex.target.isMS()) {
         Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
         Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
      } else {
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
      }
   }

   bld.remove(suq);
   return true;
}
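
// The two non-obvious SUQ results above, spelled out (MS_X/MS_Y hold log2 of
// the per-axis sample counts, as used throughout this file):
//
//    cube map depth -> layer count:  depth / 6   (six faces per layer)
//    sample count:                   1 << (ms_x + ms_y)
//
// e.g. a 4x multisampled surface with ms_x == 1, ms_y == 1 reports 1 << 2 == 4.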

void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const int arg = tex->tex.target.getArgCount();
   int slot = tex->tex.r;

   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1);

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
   Value *ind = tex->getIndirectR();

   Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
   Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1);
}
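
// A minimal sketch of the rewrite above with the IR ops folded into plain
// arithmetic (helper name is hypothetical). Each pixel covers a
// (1 << ms_x) x (1 << ms_y) block of samples, and dx/dy are looked up in the
// MS info table indexed by the sample index:
static inline void
exampleMsCoord(uint32_t x, uint32_t y, uint32_t ms_x, uint32_t ms_y,
               uint32_t dx, uint32_t dy, uint32_t *outX, uint32_t *outY)
{
   *outX = (x << ms_x) + dx; // dx = msInfo32 at byte offset ((s & 7) << 3) + 0
   *outY = (y << ms_y) + dy; // dy = msInfo32 at byte offset ((s & 7) << 3) + 4
}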

// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   Value *bf, *eau, *off;
   Value *addr, *pred;
   Value *ind = su->getIndirectR();

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      int dimc = c;

      if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
         // The array index is stored in the Z component for 1D arrays.
         dimc = 2;
      }

      src[c] = bld.getScratch();
      if (c == 0 && raw)
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X);
      else
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, dimc);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
            z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this?
      // XXX: does it matter that we don't mask high bytes in bf?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it to 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64-bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred1 =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));

   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      // make sure that the format doesn't mismatch
      assert(format->components != 0);
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred1->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred1->getDef(0));

   // TODO: initialize def values to 0 when the surface operation is not
   // performed (not needed for stores). Also, fix the "address bounds test"
   // subtests from arb_shader_image_load_store-invalid for buffers, because
   // it seems that the predicate is not correctly set by suclamp.
}
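
// For the common 2D non-array case the sequence above boils down to a
// pitch-linear address computation; a simplified sketch (hypothetical helper,
// ignoring the sub-format byte offset and the out-of-bounds flags that
// SUCLAMP/SUBFM/SUEAU carry along):
static inline uint64_t
example2dByteAddress(uint64_t base, uint32_t pitch, uint32_t xBytes, uint32_t y)
{
   return base + (uint64_t)y * pitch + xBytes;
}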

static DataType
getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
{
   switch (t->type) {
   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
   case UINT:
      return (t->bits[c] == 8 ? TYPE_U8 :
              (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
   case SINT:
      return (t->bits[c] == 8 ? TYPE_S8 :
              (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
   }
   return TYPE_NONE;
}

static DataType
getDestType(const ImgType type) {
   switch (type) {
   case FLOAT:
   case UNORM:
   case SNORM:
      return TYPE_F32;
   case UINT:
      return TYPE_U32;
   case SINT:
      return TYPE_S32;
   default:
      assert(!"Impossible type");
      return TYPE_NONE;
   }
}

void
NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
{
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int width = format->bits[0] + format->bits[1] +
      format->bits[2] + format->bits[3];
   Value *untypedDst[4] = {};
   Value *typedDst[4] = {};

   // We must convert this to a generic load.
   su->op = OP_SULDB;

   su->dType = typeOfSize(width / 8);
   su->sType = TYPE_U8;

   for (int i = 0; i < width / 32; i++)
      untypedDst[i] = bld.getSSA();
   if (width < 32)
      untypedDst[0] = bld.getSSA();

   for (int i = 0; i < 4; i++) {
      typedDst[i] = su->getDef(i);
   }

   // Set the untyped dsts as the su's destinations
   for (int i = 0; i < 4; i++)
      su->setDef(i, untypedDst[i]);

   bld.setPosition(su, true);

   // Unpack each component into the typed dsts
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;
      if (i >= format->components) {
         if (format->type == FLOAT ||
             format->type == UNORM ||
             format->type == SNORM)
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
         else
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
         continue;
      }

      // Get just that component's data into the relevant place
      if (format->bits[i] == 32)
         bld.mkMov(typedDst[i], untypedDst[i]);
      else if (format->bits[i] == 16)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[i / 2])
         ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
      else if (format->bits[i] == 8)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[0])->subOp = i;
      else {
         bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
                   bld.mkImm((bits % 32) | (format->bits[i] << 8)));
         if (format->type == UNORM || format->type == SNORM)
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
      }

      // Normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
      else if (format->type == FLOAT && format->bits[i] < 16) {
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
      }
   }

   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }
}
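
// The normalization factors above follow the usual UNORM/SNORM definitions:
//
//    UNORM, n bits: f = x / (2^n - 1)        e.g. UNORM8: x * (1.0f / 255)
//    SNORM, n bits: f = x / (2^(n-1) - 1)    e.g. SNORM8: x * (1.0f / 127)
//
// The sub-16-bit FLOAT case shifts the packed value into f16 position before
// converting, which works because the 10/11-bit packed floats use the same
// 5-bit exponent (bias 15) as f16.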

void
NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
{
   processSurfaceCoordsNVE4(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      assert(su->getPredicate());
      Value *pred =
         bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
                    su->getPredicate(), su->getSrc(2));

      Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
      red->setSrc(1, su->getSrc(3));
      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(4));
      red->setIndirect(0, 0, su->getSrc(0));

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, pred);
      mov->setPredicate(CC_P, pred);

      bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
                red->getDef(0), mov->getDef(0));

      delete_Instruction(bld.getProgram(), su);
      handleCasExch(red, true);
   }

   if (su->op == OP_SUSTB || su->op == OP_SUSTP)
      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
}
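
// The predicate + OP_UNION pattern above (repeated in the NVC0 and GM107
// paths below) is effectively a select: the atomic only writes its def when
// the bounds predicate passes, the mov writes 0 when it fails, and OP_UNION
// merges the two partial definitions. In scalar terms:
//
//    result = bound_ok ? atom(addr, ...) : 0;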

void
NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *src[3];
   Value *v;
   Value *ind = su->getIndirectR();

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   if (ind) {
      Value *ptr;
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
      su->setIndirectR(ptr);
   }

   // get surface coordinates
   for (c = 0; c < arg; ++c)
      src[c] = su->getSrc(c);
   for (; c < 3; ++c)
      src[c] = zero;

   // calculate pixel offset
   if (su->op == OP_SULDP || su->op == OP_SUREDP) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE);
      su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
   }

   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      assert(dim > 1);
      su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
   }

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // make sure that the format doesn't mismatch when it's not FMT_NONE
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}

void
NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
{
   if (su->tex.target == TEX_TARGET_1D_ARRAY) {
      /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
       * will simplify the lowering pass and the texture constraints. */
      su->moveSources(1, 1);
      su->setSrc(1, bld.loadImm(NULL, 0));
      su->tex.target = TEX_TARGET_2D_ARRAY;
   }

   processSurfaceCoordsNVC0(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      const int dim = su->tex.target.getDim();
      const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
      LValue *addr = bld.getSSA(8);
      Value *def = su->getDef(0);

      su->op = OP_SULEA;

      // Set the destination to the address
      su->dType = TYPE_U64;
      su->setDef(0, addr);
      su->setDef(1, su->getPredicate());

      bld.setPosition(su, true);

      // Perform the atomic op
      Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
      red->setSrc(1, su->getSrc(arg));
      if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(arg + 1));
      red->setIndirect(0, 0, addr);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, su->getPredicate());
      mov->setPredicate(CC_P, su->getPredicate());

      bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));

      handleCasExch(red, false);
   }
}

void
NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   Value *ind = su->getIndirectR();
   int pos = 0;

   bld.setPosition(su, false);

   // add texture handle
   switch (su->op) {
   case OP_SUSTP:
      pos = 4;
      break;
   case OP_SUREDP:
      pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
      break;
   default:
      assert(pos == 0);
      break;
   }
   su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // make sure that the format doesn't mismatch when it's not FMT_NONE
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}

void
NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
{
   processSurfaceCoordsGM107(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDP) {
      Value *def = su->getDef(0);

      su->op = OP_SUREDB;
      su->setDef(0, bld.getSSA());

      bld.setPosition(su, true);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      mov->setPredicate(CC_P, su->getPredicate());

      bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
   }
}

bool
NVC0LoweringPass::handleWRSV(Instruction *i)
{
   Instruction *st;
   Symbol *sym;
   uint32_t addr;

   // must replace, $sregs are not writeable
   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
                    i->getSrc(1));
   st->perPatch = i->perPatch;

   bld.getBB()->remove(i);
   return true;
}

void
NVC0LoweringPass::handleLDST(Instruction *i)
{
   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
      if (prog->getType() == Program::TYPE_COMPUTE) {
         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
         i->getSrc(0)->reg.fileIndex = 0;
      } else
      if (prog->getType() == Program::TYPE_GEOMETRY &&
          i->src(0).isIndirect(0)) {
         // XXX: this assumes vec4 units
         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                 i->getIndirect(0, 0), bld.mkImm(4));
         i->setIndirect(0, 0, ptr);
         i->op = OP_VFETCH;
      } else {
         i->op = OP_VFETCH;
         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
      }
   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
          prog->getType() == Program::TYPE_COMPUTE) {
         // The launch descriptor only allows setting up 8 CBs, but OpenGL
         // requires at least 12 UBOs. To bypass this limitation, we store the
         // UBO addresses in the driver constbuf and load directly from global
         // memory.
         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
         Value *ind = i->getIndirect(0, 1);

         if (!ind && fileIndex == -1)
            return;

         if (ind) {
            // Clamp the UBO index when an indirect access is used to avoid
            // loading information from the wrong place in the driver cb.
            // TODO - synchronize the max with the driver.
            ind = bld.mkOp2v(OP_MIN, TYPE_U32, ind,
                             bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
                                        ind, bld.loadImm(NULL, fileIndex)),
                             bld.loadImm(NULL, 13));
            fileIndex = 0;
         }

         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
         Value *length = loadUboLength32(ind, fileIndex * 16);
         Value *pred = new_LValue(func, FILE_PREDICATE);
         if (i->src(0).isIndirect(0)) {
            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
         }
         i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
         i->setPredicate(CC_NOT_P, pred);
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      } else if (i->src(0).isIndirect(1)) {
         Value *ptr;
         if (i->src(0).isIndirect(0))
            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(0x1010),
                             i->getIndirect(0, 0));
         else
            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(16));
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         i->subOp = NV50_IR_SUBOP_LDC_IS;
      }
   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
      i->op = OP_VFETCH;
   } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
      Value *ind = i->getIndirect(0, 1);
      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
      // XXX come up with a way not to do this for EVERY little access but
      // rather to batch these up somehow. Unfortunately we've lost the
      // information about the field width by the time we get here.
      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (i->src(0).isIndirect(0)) {
         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
      }
      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, ptr);
      i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      i->setPredicate(CC_NOT_P, pred);
      if (i->defExists(0)) {
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      }
   }
}
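
// The bounds check built above compares the *end* of the access against the
// buffer length, so partially out-of-bounds accesses are caught as well. In
// scalar terms (hypothetical names):
//
//    bool oob = constOffset + typeSizeof(sType) + indirect > length;
//    result  = oob ? 0 : load(base + indirect + constOffset);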

void
NVC0LoweringPass::readTessCoord(LValue *dst, int c)
{
   Value *laneid = bld.getSSA();
   Value *x, *y;

   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));

   if (c == 0) {
      x = dst;
      y = NULL;
   } else
   if (c == 1) {
      x = NULL;
      y = dst;
   } else {
      assert(c == 2);
      if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
         bld.mkMov(dst, bld.loadImm(NULL, 0));
         return;
      }
      x = bld.getSSA();
      y = bld.getSSA();
   }
   if (x)
      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
   if (y)
      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);

   if (c == 2) {
      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
   }
}
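
// Only the first two tess coordinates are stored in the shader output space;
// for the triangle domain the third barycentric coordinate is reconstructed
// above from the identity u + v + w == 1:
//
//    w = 1.0f - (u + v);
//
// For the quad and isoline domains the third component is simply 0.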

bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   const SVSemantic sv = sym->reg.data.sv.sv;
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) {
      // mov $sreg
      if (sym->reg.data.sv.index == 3) {
         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
      }
      if (sv == SV_VERTEX_COUNT) {
         bld.setPosition(i, true);
         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
      }
      return true;
   }

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      if (i->srcExists(1)) {
         // Pass offset through to the interpolation logic
         ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
                           i->getDef(0), addr, NULL);
         ld->setSrc(1, i->getSrc(1));
      } else {
         bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      }
      break;
   case SV_FACE:
   {
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   case SV_NTID:
   case SV_NCTAID:
   case SV_GRIDID:
      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
      if (sym->reg.data.sv.index == 3) {
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
         return true;
      }
      // Fallthrough
   case SV_WORK_DIM:
      addr += prog->driver->prop.cp.gridInfoBase;
      bld.mkLoad(TYPE_U32, i->getDef(0),
                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                              TYPE_U32, addr), NULL);
      break;
   case SV_SAMPLE_INDEX:
      // TODO: Properly pass source as an address in the PIX address space
      // (which can be of the form [r0+offset]). But this is currently
      // unnecessary.
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      break;
   case SV_SAMPLE_POS: {
      Value *off = new_LValue(func, FILE_GPR);
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 i->getDef(0),
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase +
                       4 * sym->reg.data.sv.index),
                 off);
      break;
   }
   case SV_SAMPLE_MASK: {
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
      Instruction *sampleid =
         bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      Value *masked =
         bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
                    bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                               bld.loadImm(NULL, 1), sampleid->getDef(0)));
      if (prog->driver->prop.fp.persampleInvocation) {
         bld.mkMov(i->getDef(0), masked);
      } else {
         bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
                   bld.mkImm(0))
            ->subOp = 1;
      }
      break;
   }
   case SV_BASEVERTEX:
   case SV_BASEINSTANCE:
   case SV_DRAWID:
      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
                      bld.mkSymbol(FILE_MEMORY_CONST,
                                   prog->driver->io.auxCBSlot,
                                   TYPE_U32,
                                   prog->driver->io.drawInfoBase +
                                   4 * (sv - SV_BASEVERTEX)),
                      NULL);
      break;
   default:
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      if (prog->getType() == Program::TYPE_FRAGMENT) {
         bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
      } else {
         ld = bld.mkFetch(i->getDef(0), i->dType,
                          FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
         ld->perPatch = i->perPatch;
      }
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

bool
NVC0LoweringPass::handleMOD(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   LValue *value = bld.getScratch(typeSizeof(i->dType));
   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
   bld.mkOp1(OP_TRUNC, i->dType, value, value);
   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
   i->op = OP_SUB;
   i->setSrc(1, value);
   return true;
}
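
// Both float lowerings above lean on the hardware reciprocal:
//
//    a / b    ->  a * rcp(b)
//    mod(a,b) ->  a - b * trunc(a * rcp(b))
//
// i.e. fmod-style truncation toward zero, accurate up to the precision of RCP.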

bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
   if (i->dType == TYPE_F64) {
      Value *pred = bld.getSSA(1, FILE_PREDICATE);
      Value *zero = bld.loadImm(NULL, 0.0);
      Value *dst = bld.getSSA(8);
      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
      bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
      i->op = OP_MUL;
      i->setSrc(1, dst);
      // TODO: Handle this properly with a library function
   } else {
      bld.setPosition(i, true);
      i->op = OP_RSQ;
      bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
   }

   return true;
}
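
// The two identities used above, for reference:
//
//    f64: sqrt(x) = x * rsq(x)     (x <= 0 is forced to 0 first, since
//                                   rsq(0) is +inf and 0 * inf would be NaN)
//    f32: sqrt(x) = rcp(rsq(x))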

bool
NVC0LoweringPass::handlePOW(Instruction *i)
{
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}
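
// Standard exp/log expansion: pow(x, y) = exp2(y * log2(x)), defined for
// x > 0. PREEX2 is the pre-op EX2 requires on this hardware, and the dnz
// (flush-to-zero) multiply is presumably what keeps y == 0 well-behaved
// when log2(x) is infinite.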

bool
NVC0LoweringPass::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      int id = i->getSrc(0)->reg.data.offset / 4;

      if (i->src(0).isIndirect(0)) // TODO, ugly
         return false;
      i->op = OP_MOV;
      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
      i->src(0).set(i->src(1));
      i->setSrc(1, NULL);
      i->setDef(0, new_LValue(func, FILE_GPR));
      i->getDef(0)->reg.data.id = id;

      prog->maxGPR = MAX2(prog->maxGPR, id);
   } else
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      i->setIndirect(0, 1, gpEmitAddress);
   }
   return true;
}

bool
NVC0LoweringPass::handleOUT(Instruction *i)
{
   Instruction *prev = i->prev;
   ImmediateValue stream, prevStream;

   // Only merge if the stream ids match. Also, note that the previous
   // instruction would have already been lowered, so we take arg1 from it.
   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
       i->src(0).getImmediate(stream) &&
       prev->src(1).getImmediate(prevStream) &&
       stream.reg.data.u32 == prevStream.reg.data.u32) {
      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
      delete_Instruction(prog, i);
   } else {
      assert(gpEmitAddress);
      i->setDef(0, gpEmitAddress);
      i->setSrc(1, i->getSrc(0));
      i->setSrc(0, gpEmitAddress);
   }
   return true;
}

// Generate a binary predicate if an instruction is predicated by
// e.g. an f32 value.
void
NVC0LoweringPass::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *pdst;

   if (!pred || pred->reg.file == FILE_PREDICATE)
      return;
   pdst = new_LValue(func, FILE_PREDICATE);

   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass

   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);

   insn->setPredicate(insn->cc, pdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bool ret = true;
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      ret = handleEXPORT(i);
      break;
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
      handleLDST(i);
      break;
   case OP_ATOM:
   {
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
         handleSurfaceOpGM107(i->asTex());
      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      else
         handleSurfaceOpNVC0(i->asTex());
      break;
   case OP_SUQ:
      handleSUQ(i->asTex());
      break;
   case OP_BUFQ:
      handleBUFQ(i);
      break;
   default:
      break;
   }

   /* Kepler+ has a special opcode to compute a new base address to be used
    * for indirect loads.
    *
    * Maxwell+ has an additional similar requirement for indirect
    * interpolation ops in frag shaders.
    */
   bool doAfetch = false;
   if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
       !i->perPatch &&
       (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }
   if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
       (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }

   if (doAfetch) {
      Value *addr = cloneShallow(func, i->getSrc(0));
      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
                                      i->getSrc(0));
      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
      addr->reg.data.offset = 0;
      i->setSrc(0, addr);
      i->setIndirect(0, 0, afetch->getDef(0));
   }

   return ret;
}

bool
TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
{
   if (stage == CG_STAGE_PRE_SSA) {
      NVC0LoweringPass pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NVC0LegalizePostRA pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      NVC0LegalizeSSA pass;
      return pass.run(prog, false, true);
   }
   return false;
}

} // namespace nv50_ir