1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "nv50/codegen/nv50_ir.h"
24 #include "nv50/codegen/nv50_ir_build_util.h"
25
26 #include "nv50_ir_target_nvc0.h"
27
28 #include <limits>
29
30 namespace nv50_ir {
31
// Two-bit per-lane operation selectors used to build a quad operation code.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four per-lane selectors into a single byte. The arguments are given
// in the order UL, UR, LL, LR; the first one ends up in the top two bits.
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
41
42 class NVC0LegalizeSSA : public Pass
43 {
44 private:
45 virtual bool visit(BasicBlock *);
46 virtual bool visit(Function *);
47
48 // we want to insert calls to the builtin library only after optimization
49 void handleDIV(Instruction *); // integer division, modulus
50 void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
51
52 private:
53 BuildUtil bld;
54 };
55
56 void
handleDIV(Instruction * i)57 NVC0LegalizeSSA::handleDIV(Instruction *i)
58 {
59 FlowInstruction *call;
60 int builtin;
61 Value *def[2];
62
63 bld.setPosition(i, false);
64 def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
65 def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
66 switch (i->dType) {
67 case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
68 case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
69 default:
70 return;
71 }
72 call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
73 bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
74 bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
75 bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
76
77 call->fixed = 1;
78 call->absolute = call->builtin = 1;
79 call->target.builtin = builtin;
80 delete_Instruction(prog, i);
81 }
82
83 void
handleRCPRSQ(Instruction * i)84 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
85 {
86 // TODO
87 }
88
89 bool
visit(Function * fn)90 NVC0LegalizeSSA::visit(Function *fn)
91 {
92 bld.setProgram(fn->getProgram());
93 return true;
94 }
95
96 bool
visit(BasicBlock * bb)97 NVC0LegalizeSSA::visit(BasicBlock *bb)
98 {
99 Instruction *next;
100 for (Instruction *i = bb->getEntry(); i; i = next) {
101 next = i->next;
102 if (i->dType == TYPE_F32)
103 continue;
104 switch (i->op) {
105 case OP_DIV:
106 case OP_MOD:
107 handleDIV(i);
108 break;
109 case OP_RCP:
110 case OP_RSQ:
111 if (i->dType == TYPE_F64)
112 handleRCPRSQ(i);
113 break;
114 default:
115 break;
116 }
117 }
118 return true;
119 }
120
121 class NVC0LegalizePostRA : public Pass
122 {
123 public:
124 NVC0LegalizePostRA(const Program *);
125
126 private:
127 virtual bool visit(Function *);
128 virtual bool visit(BasicBlock *);
129
130 void replaceZero(Instruction *);
131 void split64BitOp(Instruction *);
132 bool tryReplaceContWithBra(BasicBlock *);
133 void propagateJoin(BasicBlock *);
134
135 struct TexUse
136 {
TexUsenv50_ir::NVC0LegalizePostRA::TexUse137 TexUse(Instruction *use, const Instruction *tex)
138 : insn(use), tex(tex), level(-1) { }
139 Instruction *insn;
140 const Instruction *tex; // or split / mov
141 int level;
142 };
143 struct Limits
144 {
Limitsnv50_ir::NVC0LegalizePostRA::Limits145 Limits() { }
Limitsnv50_ir::NVC0LegalizePostRA::Limits146 Limits(int min, int max) : min(min), max(max) { }
147 int min, max;
148 };
149 bool insertTextureBarriers(Function *);
150 inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
151 void findFirstUses(const Instruction *tex, const Instruction *def,
152 std::list<TexUse>&);
153 void findOverwritingDefs(const Instruction *tex, Instruction *insn,
154 const BasicBlock *term,
155 std::list<TexUse>&);
156 void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
157 const Instruction *recurseDef(const Instruction *);
158
159 private:
160 LValue *r63;
161 const bool needTexBar;
162 };
163
NVC0LegalizePostRA(const Program * prog)164 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
165 : needTexBar(prog->getTarget()->getChipset() >= 0xe0)
166 {
167 }
168
169 bool
insnDominatedBy(const Instruction * later,const Instruction * early) const170 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
171 const Instruction *early) const
172 {
173 if (early->bb == later->bb)
174 return early->serial < later->serial;
175 return later->bb->dominatedBy(early->bb);
176 }
177
178 void
addTexUse(std::list<TexUse> & uses,Instruction * usei,const Instruction * insn)179 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
180 Instruction *usei, const Instruction *insn)
181 {
182 bool add = true;
183 for (std::list<TexUse>::iterator it = uses.begin();
184 it != uses.end();) {
185 if (insnDominatedBy(usei, it->insn)) {
186 add = false;
187 break;
188 }
189 if (insnDominatedBy(it->insn, usei))
190 it = uses.erase(it);
191 else
192 ++it;
193 }
194 if (add)
195 uses.push_back(TexUse(usei, insn));
196 }
197
198 void
findOverwritingDefs(const Instruction * texi,Instruction * insn,const BasicBlock * term,std::list<TexUse> & uses)199 NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
200 Instruction *insn,
201 const BasicBlock *term,
202 std::list<TexUse> &uses)
203 {
204 while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
205 insn = insn->getSrc(0)->getUniqueInsn();
206
207 if (!insn || !insn->bb->reachableBy(texi->bb, term))
208 return;
209
210 switch (insn->op) {
211 /* Values not connected to the tex's definition through any of these should
212 * not be conflicting.
213 */
214 case OP_SPLIT:
215 case OP_MERGE:
216 case OP_PHI:
217 case OP_UNION:
218 /* recurse again */
219 for (int s = 0; insn->srcExists(s); ++s)
220 findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
221 uses);
222 break;
223 default:
224 // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
225 addTexUse(uses, insn, texi);
226 break;
227 }
228 }
229
230 void
findFirstUses(const Instruction * texi,const Instruction * insn,std::list<TexUse> & uses)231 NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
232 const Instruction *insn,
233 std::list<TexUse> &uses)
234 {
235 for (int d = 0; insn->defExists(d); ++d) {
236 Value *v = insn->getDef(d);
237 for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
238 Instruction *usei = (*u)->getInsn();
239
240 if (usei->op == OP_PHI || usei->op == OP_UNION) {
241 // need a barrier before WAW cases
242 for (int s = 0; usei->srcExists(s); ++s) {
243 Instruction *defi = usei->getSrc(s)->getUniqueInsn();
244 if (defi && &usei->src(s) != *u)
245 findOverwritingDefs(texi, defi, usei->bb, uses);
246 }
247 }
248
249 if (usei->op == OP_SPLIT ||
250 usei->op == OP_MERGE ||
251 usei->op == OP_PHI ||
252 usei->op == OP_UNION) {
253 // these uses don't manifest in the machine code
254 findFirstUses(texi, usei, uses);
255 } else
256 if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
257 usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
258 findFirstUses(texi, usei, uses);
259 } else {
260 addTexUse(uses, usei, insn);
261 }
262 }
263 }
264 }
265
266 // Texture barriers:
267 // This pass is a bit long and ugly and can probably be optimized.
268 //
269 // 1. obtain a list of TEXes and their outputs' first use(s)
270 // 2. calculate the barrier level of each first use (minimal number of TEXes,
271 // over all paths, between the TEX and the use in question)
272 // 3. for each barrier, if all paths from the source TEX to that barrier
273 // contain a barrier of lesser level, it can be culled
274 bool
insertTextureBarriers(Function * fn)275 NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
276 {
277 std::list<TexUse> *uses;
278 std::vector<Instruction *> texes;
279 std::vector<int> bbFirstTex;
280 std::vector<int> bbFirstUse;
281 std::vector<int> texCounts;
282 std::vector<TexUse> useVec;
283 ArrayList insns;
284
285 fn->orderInstructions(insns);
286
287 texCounts.resize(fn->allBBlocks.getSize(), 0);
288 bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
289 bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
290
291 // tag BB CFG nodes by their id for later
292 for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
293 BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
294 if (bb)
295 bb->cfg.tag = bb->getId();
296 }
297
298 // gather the first uses for each TEX
299 for (int i = 0; i < insns.getSize(); ++i) {
300 Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
301 if (isTextureOp(tex->op)) {
302 texes.push_back(tex);
303 if (!texCounts.at(tex->bb->getId()))
304 bbFirstTex[tex->bb->getId()] = texes.size() - 1;
305 texCounts[tex->bb->getId()]++;
306 }
307 }
308 insns.clear();
309 if (texes.empty())
310 return false;
311 uses = new std::list<TexUse>[texes.size()];
312 if (!uses)
313 return false;
314 for (size_t i = 0; i < texes.size(); ++i)
315 findFirstUses(texes[i], texes[i], uses[i]);
316
317 // determine the barrier level at each use
318 for (size_t i = 0; i < texes.size(); ++i) {
319 for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
320 ++u) {
321 BasicBlock *tb = texes[i]->bb;
322 BasicBlock *ub = u->insn->bb;
323 if (tb == ub) {
324 u->level = 0;
325 for (size_t j = i + 1; j < texes.size() &&
326 texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
327 ++j)
328 u->level++;
329 } else {
330 u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
331 &ub->cfg, texCounts);
332 if (u->level < 0) {
333 WARN("Failed to find path TEX -> TEXBAR\n");
334 u->level = 0;
335 continue;
336 }
337 // this counted all TEXes in the origin block, correct that
338 u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
339 // and did not count the TEXes in the destination block, add those
340 for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
341 texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
342 ++j)
343 u->level++;
344 }
345 assert(u->level >= 0);
346 useVec.push_back(*u);
347 }
348 }
349 delete[] uses;
350 uses = NULL;
351
352 // insert the barriers
353 for (size_t i = 0; i < useVec.size(); ++i) {
354 Instruction *prev = useVec[i].insn->prev;
355 if (useVec[i].level < 0)
356 continue;
357 if (prev && prev->op == OP_TEXBAR) {
358 if (prev->subOp > useVec[i].level)
359 prev->subOp = useVec[i].level;
360 prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
361 } else {
362 Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
363 bar->fixed = 1;
364 bar->subOp = useVec[i].level;
365 // make use explicit to ease latency calculation
366 bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
367 useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
368 }
369 }
370
371 if (fn->getProgram()->optLevel < 3) {
372 if (uses)
373 delete[] uses;
374 return true;
375 }
376
377 std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
378
379 limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
380 limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
381 limitS.resize(fn->allBBlocks.getSize());
382
383 // cull unneeded barriers (should do that earlier, but for simplicity)
384 IteratorRef bi = fn->cfg.iteratorCFG();
385 // first calculate min/max outstanding TEXes for each BB
386 for (bi->reset(); !bi->end(); bi->next()) {
387 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
388 BasicBlock *bb = BasicBlock::get(n);
389 int min = 0;
390 int max = std::numeric_limits<int>::max();
391 for (Instruction *i = bb->getFirst(); i; i = i->next) {
392 if (isTextureOp(i->op)) {
393 min++;
394 if (max < std::numeric_limits<int>::max())
395 max++;
396 } else
397 if (i->op == OP_TEXBAR) {
398 min = MIN2(min, i->subOp);
399 max = MIN2(max, i->subOp);
400 }
401 }
402 // limits when looking at an isolated block
403 limitS[bb->getId()].min = min;
404 limitS[bb->getId()].max = max;
405 }
406 // propagate the min/max values
407 for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
408 for (bi->reset(); !bi->end(); bi->next()) {
409 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
410 BasicBlock *bb = BasicBlock::get(n);
411 const int bbId = bb->getId();
412 for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
413 BasicBlock *in = BasicBlock::get(ei.getNode());
414 const int inId = in->getId();
415 limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
416 limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
417 }
418 // I just hope this is correct ...
419 if (limitS[bbId].max == std::numeric_limits<int>::max()) {
420 // no barrier
421 limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
422 limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
423 } else {
424 // block contained a barrier
425 limitB[bbId].min = MIN2(limitS[bbId].max,
426 limitT[bbId].min + limitS[bbId].min);
427 limitB[bbId].max = MIN2(limitS[bbId].max,
428 limitT[bbId].max + limitS[bbId].min);
429 }
430 }
431 }
432 // finally delete unnecessary barriers
433 for (bi->reset(); !bi->end(); bi->next()) {
434 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
435 BasicBlock *bb = BasicBlock::get(n);
436 Instruction *prev = NULL;
437 Instruction *next;
438 int max = limitT[bb->getId()].max;
439 for (Instruction *i = bb->getFirst(); i; i = next) {
440 next = i->next;
441 if (i->op == OP_TEXBAR) {
442 if (i->subOp >= max) {
443 delete_Instruction(prog, i);
444 } else {
445 max = i->subOp;
446 if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
447 delete_Instruction(prog, prev);
448 prev = NULL;
449 }
450 }
451 } else
452 if (isTextureOp(i->op)) {
453 max++;
454 }
455 if (!i->isNop())
456 prev = i;
457 }
458 }
459 if (uses)
460 delete[] uses;
461 return true;
462 }
463
464 bool
visit(Function * fn)465 NVC0LegalizePostRA::visit(Function *fn)
466 {
467 if (needTexBar)
468 insertTextureBarriers(fn);
469
470 r63 = new_LValue(fn, FILE_GPR);
471 r63->reg.data.id = 63;
472 return true;
473 }
474
475 void
replaceZero(Instruction * i)476 NVC0LegalizePostRA::replaceZero(Instruction *i)
477 {
478 for (int s = 0; i->srcExists(s); ++s) {
479 ImmediateValue *imm = i->getSrc(s)->asImm();
480 if (imm && imm->reg.data.u64 == 0)
481 i->setSrc(s, r63);
482 }
483 }
484
485 void
split64BitOp(Instruction * i)486 NVC0LegalizePostRA::split64BitOp(Instruction *i)
487 {
488 if (i->dType == TYPE_F64) {
489 if (i->op == OP_MAD)
490 i->op = OP_FMA;
491 if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
492 i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
493 i->op == OP_SET)
494 return;
495 i->dType = i->sType = TYPE_U32;
496
497 i->bb->insertAfter(i, cloneForward(func, i));
498 }
499 }
500
501 // replace CONT with BRA for single unconditional continue
502 bool
tryReplaceContWithBra(BasicBlock * bb)503 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
504 {
505 if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
506 return false;
507 Graph::EdgeIterator ei = bb->cfg.incident();
508 if (ei.getType() != Graph::Edge::BACK)
509 ei.next();
510 if (ei.getType() != Graph::Edge::BACK)
511 return false;
512 BasicBlock *contBB = BasicBlock::get(ei.getNode());
513
514 if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
515 contBB->getExit()->getPredicate())
516 return false;
517 contBB->getExit()->op = OP_BRA;
518 bb->remove(bb->getEntry()); // delete PRECONT
519
520 ei.next();
521 assert(ei.end() || ei.getType() != Graph::Edge::BACK);
522 return true;
523 }
524
525 // replace branches to join blocks with join ops
526 void
propagateJoin(BasicBlock * bb)527 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
528 {
529 if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
530 return;
531 for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
532 BasicBlock *in = BasicBlock::get(ei.getNode());
533 Instruction *exit = in->getExit();
534 if (!exit) {
535 in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
536 // there should always be a terminator instruction
537 WARN("inserted missing terminator in BB:%i\n", in->getId());
538 } else
539 if (exit->op == OP_BRA) {
540 exit->op = OP_JOIN;
541 exit->asFlow()->limit = 1; // must-not-propagate marker
542 }
543 }
544 bb->remove(bb->getEntry());
545 }
546
547 bool
visit(BasicBlock * bb)548 NVC0LegalizePostRA::visit(BasicBlock *bb)
549 {
550 Instruction *i, *next;
551
552 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
553 for (i = bb->getFirst(); i; i = next) {
554 next = i->next;
555 if (i->op == OP_EMIT || i->op == OP_RESTART) {
556 if (!i->getDef(0)->refCount())
557 i->setDef(0, NULL);
558 if (i->src(0).getFile() == FILE_IMMEDIATE)
559 i->setSrc(0, r63); // initial value must be 0
560 } else
561 if (i->isNop()) {
562 bb->remove(i);
563 } else {
564 if (i->op != OP_MOV && i->op != OP_PFETCH)
565 replaceZero(i);
566 if (typeSizeof(i->dType) == 8)
567 split64BitOp(i);
568 }
569 }
570 if (!bb->getEntry())
571 return true;
572
573 if (!tryReplaceContWithBra(bb))
574 propagateJoin(bb);
575
576 return true;
577 }
578
579 class NVC0LoweringPass : public Pass
580 {
581 public:
582 NVC0LoweringPass(Program *);
583
584 private:
585 virtual bool visit(Function *);
586 virtual bool visit(BasicBlock *);
587 virtual bool visit(Instruction *);
588
589 bool handleRDSV(Instruction *);
590 bool handleWRSV(Instruction *);
591 bool handleEXPORT(Instruction *);
592 bool handleOUT(Instruction *);
593 bool handleDIV(Instruction *);
594 bool handleMOD(Instruction *);
595 bool handleSQRT(Instruction *);
596 bool handlePOW(Instruction *);
597 bool handleTEX(TexInstruction *);
598 bool handleTXD(TexInstruction *);
599 bool handleTXQ(TexInstruction *);
600 bool handleManualTXD(TexInstruction *);
601
602 void checkPredicate(Instruction *);
603
604 void readTessCoord(LValue *dst, int c);
605
606 private:
607 const Target *const targ;
608
609 BuildUtil bld;
610
611 LValue *gpEmitAddress;
612 };
613
NVC0LoweringPass(Program * prog)614 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
615 {
616 bld.setProgram(prog);
617 }
618
619 bool
visit(Function * fn)620 NVC0LoweringPass::visit(Function *fn)
621 {
622 if (prog->getType() == Program::TYPE_GEOMETRY) {
623 assert(!strncmp(fn->getName(), "MAIN", 4));
624 // TODO: when we generate actual functions pass this value along somehow
625 bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
626 gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
627 if (fn->cfgExit) {
628 bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
629 bld.mkMovToReg(0, gpEmitAddress);
630 }
631 }
632 return true;
633 }
634
635 bool
visit(BasicBlock * bb)636 NVC0LoweringPass::visit(BasicBlock *bb)
637 {
638 return true;
639 }
640
641 // move array source to first slot, convert to u16, add indirections
642 bool
handleTEX(TexInstruction * i)643 NVC0LoweringPass::handleTEX(TexInstruction *i)
644 {
645 const int dim = i->tex.target.getDim() + i->tex.target.isCube();
646 const int arg = i->tex.target.getArgCount();
647
648 if (prog->getTarget()->getChipset() >= 0xe0) {
649 if (i->tex.r == i->tex.s) {
650 i->tex.r += 8; // NOTE: offset should probably be a driver option
651 i->tex.s = 0; // only a single cX[] value possible here
652 } else {
653 // TODO: extract handles and use register to select TIC/TSC entries
654 }
655 if (i->tex.target.isArray()) {
656 LValue *layer = new_LValue(func, FILE_GPR);
657 Value *src = i->getSrc(arg - 1);
658 const int sat = (i->op == OP_TXF) ? 1 : 0;
659 DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
660 bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
661 for (int s = dim; s >= 1; --s)
662 i->setSrc(s, i->getSrc(s - 1));
663 i->setSrc(0, layer);
664 }
665 if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
666 Value *tmp[2];
667 Symbol *bind;
668 Value *rRel = i->getIndirectR();
669 Value *sRel = i->getIndirectS();
670 Value *shCnt = bld.loadImm(NULL, 2);
671
672 if (rRel) {
673 tmp[0] = bld.getScratch();
674 bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.r * 4);
675 bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], rRel, shCnt);
676 tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]);
677 bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
678 bld.loadImm(tmp[0], 0x00ffffffu));
679 rRel = tmp[0];
680 i->setSrc(i->tex.rIndirectSrc, NULL);
681 }
682 if (sRel) {
683 tmp[0] = bld.getScratch();
684 bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.s * 4);
685 bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], sRel, shCnt);
686 tmp[1] = bld.mkLoad(TYPE_U32, bind, tmp[0]);
687 bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
688 bld.loadImm(tmp[0], 0xff000000u));
689 sRel = tmp[0];
690 i->setSrc(i->tex.sIndirectSrc, NULL);
691 }
692 bld.mkOp2(OP_OR, TYPE_U32, rRel, rRel, sRel);
693
694 int min = i->tex.rIndirectSrc;
695 if (min < 0 || min > i->tex.sIndirectSrc)
696 min = i->tex.sIndirectSrc;
697 for (int s = min; s >= 1; --s)
698 i->setSrc(s, i->getSrc(s - 1));
699 i->setSrc(0, rRel);
700 }
701 } else
702 // (nvc0) generate and move the tsc/tic/array source to the front
703 if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
704 LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
705
706 Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(arg - 1) : NULL;
707 for (int s = dim; s >= 1; --s)
708 i->setSrc(s, i->getSrc(s - 1));
709 i->setSrc(0, arrayIndex);
710
711 Value *ticRel = i->getIndirectR();
712 Value *tscRel = i->getIndirectS();
713
714 if (arrayIndex) {
715 int sat = (i->op == OP_TXF) ? 1 : 0;
716 DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
717 bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
718 } else {
719 bld.loadImm(src, 0);
720 }
721
722 if (ticRel) {
723 i->setSrc(i->tex.rIndirectSrc, NULL);
724 bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
725 }
726 if (tscRel) {
727 i->setSrc(i->tex.sIndirectSrc, NULL);
728 bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
729 }
730
731 i->setSrc(0, src);
732 }
733
734 // offset is last source (lod 1st, dc 2nd)
735 if (i->tex.useOffsets) {
736 uint32_t value = 0;
737 int n, c;
738 int s = i->srcCount(0xff);
739 for (n = 0; n < i->tex.useOffsets; ++n)
740 for (c = 0; c < 3; ++c)
741 value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
742 i->setSrc(s, bld.loadImm(NULL, value));
743 }
744
745 return true;
746 }
747
748 bool
handleManualTXD(TexInstruction * i)749 NVC0LoweringPass::handleManualTXD(TexInstruction *i)
750 {
751 static const uint8_t qOps[4][2] =
752 {
753 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
754 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
755 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
756 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
757 };
758 Value *def[4][4];
759 Value *crd[3];
760 Instruction *tex;
761 Value *zero = bld.loadImm(bld.getSSA(), 0);
762 int l, c;
763 const int dim = i->tex.target.getDim();
764
765 i->op = OP_TEX; // no need to clone dPdx/dPdy later
766
767 for (c = 0; c < dim; ++c)
768 crd[c] = bld.getScratch();
769
770 bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
771 for (l = 0; l < 4; ++l) {
772 // mov coordinates from lane l to all lanes
773 for (c = 0; c < dim; ++c)
774 bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
775 // add dPdx from lane l to lanes dx
776 for (c = 0; c < dim; ++c)
777 bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
778 // add dPdy from lane l to lanes dy
779 for (c = 0; c < dim; ++c)
780 bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
781 // texture
782 bld.insert(tex = cloneForward(func, i));
783 for (c = 0; c < dim; ++c)
784 tex->setSrc(c, crd[c]);
785 // save results
786 for (c = 0; i->defExists(c); ++c) {
787 Instruction *mov;
788 def[c][l] = bld.getSSA();
789 mov = bld.mkMov(def[c][l], tex->getDef(c));
790 mov->fixed = 1;
791 mov->lanes = 1 << l;
792 }
793 }
794 bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
795
796 for (c = 0; i->defExists(c); ++c) {
797 Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
798 for (l = 0; l < 4; ++l)
799 u->setSrc(l, def[c][l]);
800 }
801
802 i->bb->remove(i);
803 return true;
804 }
805
806 bool
handleTXD(TexInstruction * txd)807 NVC0LoweringPass::handleTXD(TexInstruction *txd)
808 {
809 int dim = txd->tex.target.getDim();
810 int arg = txd->tex.target.getArgCount();
811
812 handleTEX(txd);
813 while (txd->srcExists(arg))
814 ++arg;
815
816 txd->tex.derivAll = true;
817 if (dim > 2 ||
818 txd->tex.target.isCube() ||
819 arg > 4 ||
820 txd->tex.target.isShadow())
821 return handleManualTXD(txd);
822
823 for (int c = 0; c < dim; ++c) {
824 txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
825 txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
826 txd->dPdx[c].set(NULL);
827 txd->dPdy[c].set(NULL);
828 }
829 return true;
830 }
831
832 bool
handleTXQ(TexInstruction * txq)833 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
834 {
835 // TODO: indirect resource/sampler index
836 return true;
837 }
838
839 bool
handleWRSV(Instruction * i)840 NVC0LoweringPass::handleWRSV(Instruction *i)
841 {
842 Instruction *st;
843 Symbol *sym;
844 uint32_t addr;
845
846 // must replace, $sreg are not writeable
847 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
848 if (addr >= 0x400)
849 return false;
850 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
851
852 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
853 i->getSrc(1));
854 st->perPatch = i->perPatch;
855
856 bld.getBB()->remove(i);
857 return true;
858 }
859
860 void
readTessCoord(LValue * dst,int c)861 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
862 {
863 Value *laneid = bld.getSSA();
864 Value *x, *y;
865
866 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
867
868 if (c == 0) {
869 x = dst;
870 y = NULL;
871 } else
872 if (c == 1) {
873 x = NULL;
874 y = dst;
875 } else {
876 assert(c == 2);
877 x = bld.getSSA();
878 y = bld.getSSA();
879 }
880 if (x)
881 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
882 if (y)
883 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
884
885 if (c == 2) {
886 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
887 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
888 }
889 }
890
891 bool
handleRDSV(Instruction * i)892 NVC0LoweringPass::handleRDSV(Instruction *i)
893 {
894 Symbol *sym = i->getSrc(0)->asSym();
895 Value *vtx = NULL;
896 Instruction *ld;
897 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
898
899 if (addr >= 0x400) // mov $sreg
900 return true;
901
902 switch (i->getSrc(0)->reg.data.sv.sv) {
903 case SV_POSITION:
904 assert(prog->getType() == Program::TYPE_FRAGMENT);
905 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
906 break;
907 case SV_FACE:
908 {
909 Value *face = i->getDef(0);
910 bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
911 if (i->dType == TYPE_F32) {
912 bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
913 bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
914 }
915 }
916 break;
917 case SV_TESS_COORD:
918 assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
919 readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
920 break;
921 default:
922 if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
923 vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
924 ld = bld.mkFetch(i->getDef(0), i->dType,
925 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
926 ld->perPatch = i->perPatch;
927 break;
928 }
929 bld.getBB()->remove(i);
930 return true;
931 }
932
933 bool
handleDIV(Instruction * i)934 NVC0LoweringPass::handleDIV(Instruction *i)
935 {
936 if (!isFloatType(i->dType))
937 return true;
938 bld.setPosition(i, false);
939 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
940 i->op = OP_MUL;
941 i->setSrc(1, rcp->getDef(0));
942 return true;
943 }
944
945 bool
handleMOD(Instruction * i)946 NVC0LoweringPass::handleMOD(Instruction *i)
947 {
948 if (i->dType != TYPE_F32)
949 return true;
950 LValue *value = bld.getScratch();
951 bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
952 bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
953 bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
954 bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
955 i->op = OP_SUB;
956 i->setSrc(1, value);
957 return true;
958 }
959
960 bool
handleSQRT(Instruction * i)961 NVC0LoweringPass::handleSQRT(Instruction *i)
962 {
963 Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
964 bld.getSSA(), i->getSrc(0));
965 i->op = OP_MUL;
966 i->setSrc(1, rsq->getDef(0));
967
968 return true;
969 }
970
971 bool
handlePOW(Instruction * i)972 NVC0LoweringPass::handlePOW(Instruction *i)
973 {
974 LValue *val = bld.getScratch();
975
976 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
977 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
978 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
979
980 i->op = OP_EX2;
981 i->setSrc(0, val);
982 i->setSrc(1, NULL);
983
984 return true;
985 }
986
987 bool
handleEXPORT(Instruction * i)988 NVC0LoweringPass::handleEXPORT(Instruction *i)
989 {
990 if (prog->getType() == Program::TYPE_FRAGMENT) {
991 int id = i->getSrc(0)->reg.data.offset / 4;
992
993 if (i->src(0).isIndirect(0)) // TODO, ugly
994 return false;
995 i->op = OP_MOV;
996 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
997 i->src(0).set(i->src(1));
998 i->setSrc(1, NULL);
999 i->setDef(0, new_LValue(func, FILE_GPR));
1000 i->getDef(0)->reg.data.id = id;
1001
1002 prog->maxGPR = MAX2(prog->maxGPR, id);
1003 } else
1004 if (prog->getType() == Program::TYPE_GEOMETRY) {
1005 i->setIndirect(0, 1, gpEmitAddress);
1006 }
1007 return true;
1008 }
1009
1010 bool
handleOUT(Instruction * i)1011 NVC0LoweringPass::handleOUT(Instruction *i)
1012 {
1013 if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
1014 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
1015 delete_Instruction(prog, i);
1016 } else {
1017 assert(gpEmitAddress);
1018 i->setDef(0, gpEmitAddress);
1019 if (i->srcExists(0))
1020 i->setSrc(1, i->getSrc(0));
1021 i->setSrc(0, gpEmitAddress);
1022 }
1023 return true;
1024 }
1025
1026 // Generate a binary predicate if an instruction is predicated by
1027 // e.g. an f32 value.
1028 void
checkPredicate(Instruction * insn)1029 NVC0LoweringPass::checkPredicate(Instruction *insn)
1030 {
1031 Value *pred = insn->getPredicate();
1032 Value *pdst;
1033
1034 if (!pred || pred->reg.file == FILE_PREDICATE)
1035 return;
1036 pdst = new_LValue(func, FILE_PREDICATE);
1037
1038 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
1039 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
1040
1041 bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, pdst, bld.mkImm(0), pred);
1042
1043 insn->setPredicate(insn->cc, pdst);
1044 }
1045
1046 //
1047 // - add quadop dance for texturing
1048 // - put FP outputs in GPRs
1049 // - convert instruction sequences
1050 //
1051 bool
visit(Instruction * i)1052 NVC0LoweringPass::visit(Instruction *i)
1053 {
1054 bld.setPosition(i, false);
1055
1056 if (i->cc != CC_ALWAYS)
1057 checkPredicate(i);
1058
1059 switch (i->op) {
1060 case OP_TEX:
1061 case OP_TXB:
1062 case OP_TXL:
1063 case OP_TXF:
1064 case OP_TXG:
1065 return handleTEX(i->asTex());
1066 case OP_TXD:
1067 return handleTXD(i->asTex());
1068 case OP_TXQ:
1069 return handleTXQ(i->asTex());
1070 case OP_EX2:
1071 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1072 i->setSrc(0, i->getDef(0));
1073 break;
1074 case OP_POW:
1075 return handlePOW(i);
1076 case OP_DIV:
1077 return handleDIV(i);
1078 case OP_MOD:
1079 return handleMOD(i);
1080 case OP_SQRT:
1081 return handleSQRT(i);
1082 case OP_EXPORT:
1083 return handleEXPORT(i);
1084 case OP_EMIT:
1085 case OP_RESTART:
1086 return handleOUT(i);
1087 case OP_RDSV:
1088 return handleRDSV(i);
1089 case OP_WRSV:
1090 return handleWRSV(i);
1091 case OP_LOAD:
1092 if (i->src(0).getFile() == FILE_SHADER_INPUT) {
1093 i->op = OP_VFETCH;
1094 assert(prog->getType() != Program::TYPE_FRAGMENT);
1095 }
1096 break;
1097 default:
1098 break;
1099 }
1100 return true;
1101 }
1102
1103 bool
runLegalizePass(Program * prog,CGStage stage) const1104 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
1105 {
1106 if (stage == CG_STAGE_PRE_SSA) {
1107 NVC0LoweringPass pass(prog);
1108 return pass.run(prog, false, true);
1109 } else
1110 if (stage == CG_STAGE_POST_RA) {
1111 NVC0LegalizePostRA pass(prog);
1112 return pass.run(prog, false, true);
1113 } else
1114 if (stage == CG_STAGE_SSA) {
1115 NVC0LegalizeSSA pass;
1116 return pass.run(prog, false, true);
1117 }
1118 return false;
1119 }
1120
1121 } // namespace nv50_ir
1122