1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
25
26 #include "codegen/nv50_ir_target_nvc0.h"
27 #include "codegen/nv50_ir_lowering_nvc0.h"
28
29 #include <limits>
30
31 namespace nv50_ir {
32
33 #define QOP_ADD 0
34 #define QOP_SUBR 1
35 #define QOP_SUB 2
36 #define QOP_MOV2 3
37
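// Pack one 2-bit op (ADD/SUBR/SUB/MOV2) per quad lane into a single byte,
// used with bld.mkQuadop() in handleManualTXD(). Lane order: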
38 // UL UR LL LR
39 #define QUADOP(q, r, s, t) \
40 ((QOP_##q << 6) | (QOP_##r << 4) | \
41 (QOP_##s << 2) | (QOP_##t << 0))
42
43 void
44 NVC0LegalizeSSA::handleDIV(Instruction *i)
45 {
46 FlowInstruction *call;
47 int builtin;
48 Value *def[2];
49
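   // Integer DIV/MOD becomes a call to a built-in routine: the operands are
   // moved into $r0/$r1, and the result is read back from $r0 (quotient) or
   // $r1 (remainder) depending on whether this was OP_DIV or OP_MOD.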
50 bld.setPosition(i, false);
51 def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
52 def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
53 switch (i->dType) {
54 case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
55 case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
56 default:
57 return;
58 }
59 call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
60 bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
61 bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
62 bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
63
64 call->fixed = 1;
65 call->absolute = call->builtin = 1;
66 call->target.builtin = builtin;
67 delete_Instruction(prog, i);
68 }
69
70 void
71 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
72 {
73 assert(i->dType == TYPE_F64);
74 // There are instructions that will compute the high 32 bits of the 64-bit
75 // float. We will just stick 0 in the bottom 32 bits.
76
77 bld.setPosition(i, false);
78
79   // 1. Take the source and split it up.
80 Value *src[2], *dst[2], *def = i->getDef(0);
81 bld.mkSplit(src, 4, i->getSrc(0));
82
83 // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
84 dst[0] = bld.loadImm(NULL, 0);
85 dst[1] = bld.getSSA();
86
87 // 3. The new version of the instruction takes the high 32 bits of the
88 // source and outputs the high 32 bits of the destination.
89 i->setSrc(0, src[1]);
90 i->setDef(0, dst[1]);
91 i->setType(TYPE_F32);
92 i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
93
94 // 4. Recombine the two dst pieces back into the original destination.
95 bld.setPosition(i, true);
96 bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
97 }
98
99 void
100 NVC0LegalizeSSA::handleFTZ(Instruction *i)
101 {
102 // Only want to flush float inputs
103 assert(i->sType == TYPE_F32);
104
105   // If we're already flushing denorms (and NaNs) to zero, no need for this.
106 if (i->dnz)
107 return;
108
109 // Only certain classes of operations can flush
110 OpClass cls = prog->getTarget()->getOpClass(i->op);
111 if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
112 cls != OPCLASS_CONVERT)
113 return;
114
115 i->ftz = true;
116 }
117
118 void
119 NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
120 {
121 if (i->tex.levelZero)
122 return;
123
124 ImmediateValue lod;
125
126 // The LOD argument comes right after the coordinates (before depth bias,
127 // offsets, etc).
128 int arg = i->tex.target.getArgCount();
129
130 // SM30+ stores the indirect handle as a separate arg, which comes before
131 // the LOD.
132 if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
133 i->tex.rIndirectSrc >= 0)
134 arg++;
135 // SM20 stores indirect handle combined with array coordinate
136 if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
137 !i->tex.target.isArray() &&
138 i->tex.rIndirectSrc >= 0)
139 arg++;
140
141 if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
142 return;
143
144 if (i->op == OP_TXL)
145 i->op = OP_TEX;
146 i->tex.levelZero = true;
147 i->moveSources(arg + 1, -1);
148 }
149
150 bool
151 NVC0LegalizeSSA::visit(Function *fn)
152 {
153 bld.setProgram(fn->getProgram());
154 return true;
155 }
156
157 bool
158 NVC0LegalizeSSA::visit(BasicBlock *bb)
159 {
160 Instruction *next;
161 for (Instruction *i = bb->getEntry(); i; i = next) {
162 next = i->next;
163
164 if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
165 handleFTZ(i);
166
167 switch (i->op) {
168 case OP_DIV:
169 case OP_MOD:
170 if (i->sType != TYPE_F32)
171 handleDIV(i);
172 break;
173 case OP_RCP:
174 case OP_RSQ:
175 if (i->dType == TYPE_F64)
176 handleRCPRSQ(i);
177 break;
178 case OP_TXL:
179 case OP_TXF:
180 handleTEXLOD(i->asTex());
181 break;
182 default:
183 break;
184 }
185 }
186 return true;
187 }
188
189 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
190 : rZero(NULL),
191 carry(NULL),
192 pOne(NULL),
193 needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
194 prog->getTarget()->getChipset() < 0x110)
195 {
196 }
197
198 bool
199 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
200 const Instruction *early) const
201 {
202 if (early->bb == later->bb)
203 return early->serial < later->serial;
204 return later->bb->dominatedBy(early->bb);
205 }
206
207 void
208 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
209 Instruction *usei, const Instruction *texi)
210 {
211 bool add = true;
212 bool dominated = insnDominatedBy(usei, texi);
213 // Uses before the tex have to all be included. Just because an earlier
214 // instruction dominates another instruction doesn't mean that there's no
215 // way to get from the tex to the later instruction. For example you could
216 // have nested loops, with the tex in the inner loop, and uses before it in
217 // both loops - even though the outer loop's instruction would dominate the
218 // inner's, we still want a texbar before the inner loop's instruction.
219 //
220 // However we can still use the eliding logic between uses dominated by the
221 // tex instruction, as that is unambiguously correct.
222 if (dominated) {
223 for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
224 if (it->after) {
225 if (insnDominatedBy(usei, it->insn)) {
226 add = false;
227 break;
228 }
229 if (insnDominatedBy(it->insn, usei)) {
230 it = uses.erase(it);
231 continue;
232 }
233 }
234 ++it;
235 }
236 }
237 if (add)
238 uses.push_back(TexUse(usei, texi, dominated));
239 }
240
241 // While it might be tempting to use an algorithm that just looks at tex
242 // uses, not all texture results are guaranteed to be used on all paths. In
243 // the case where along some control flow path a texture result is never used,
244 // we might reuse that register for something else, creating a
245 // write-after-write hazard. So we have to manually look through all
246 // instructions looking for ones that reference the registers in question.
247 void
248 NVC0LegalizePostRA::findFirstUses(
249 Instruction *texi, std::list<TexUse> &uses)
250 {
251 int minGPR = texi->def(0).rep()->reg.data.id;
252 int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
253
254 unordered_set<const BasicBlock *> visited;
255 findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
256 }
257
258 void
259 NVC0LegalizePostRA::findFirstUsesBB(
260 int minGPR, int maxGPR, Instruction *start,
261 const Instruction *texi, std::list<TexUse> &uses,
262 unordered_set<const BasicBlock *> &visited)
263 {
264 const BasicBlock *bb = start->bb;
265
266 // We don't process the whole bb the first time around. This is correct,
267 // however we might be in a loop and hit this BB again, and need to process
268 // the full thing. So only mark a bb as visited if we processed it from the
269 // beginning.
270 if (start == bb->getEntry()) {
271 if (visited.find(bb) != visited.end())
272 return;
273 visited.insert(bb);
274 }
275
276 for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
277 if (insn->isNop())
278 continue;
279
280 for (int d = 0; insn->defExists(d); ++d) {
281 const Value *def = insn->def(d).rep();
282 if (insn->def(d).getFile() != FILE_GPR ||
283 def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
284 def->reg.data.id > maxGPR)
285 continue;
286 addTexUse(uses, insn, texi);
287 return;
288 }
289
290 for (int s = 0; insn->srcExists(s); ++s) {
291 const Value *src = insn->src(s).rep();
292 if (insn->src(s).getFile() != FILE_GPR ||
293 src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
294 src->reg.data.id > maxGPR)
295 continue;
296 addTexUse(uses, insn, texi);
297 return;
298 }
299 }
300
301 for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
302 findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
303 texi, uses, visited);
304 }
305 }
306
307 // Texture barriers:
308 // This pass is a bit long and ugly and can probably be optimized.
309 //
310 // 1. obtain a list of TEXes and their outputs' first use(s)
311 // 2. calculate the barrier level of each first use (minimal number of TEXes,
312 // over all paths, between the TEX and the use in question)
313 // 3. for each barrier, if all paths from the source TEX to that barrier
314 // contain a barrier of lesser level, it can be culled
315 bool
316 NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
317 {
318 std::list<TexUse> *uses;
319 std::vector<Instruction *> texes;
320 std::vector<int> bbFirstTex;
321 std::vector<int> bbFirstUse;
322 std::vector<int> texCounts;
323 std::vector<TexUse> useVec;
324 ArrayList insns;
325
326 fn->orderInstructions(insns);
327
328 texCounts.resize(fn->allBBlocks.getSize(), 0);
329 bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
330 bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
331
332 // tag BB CFG nodes by their id for later
333 for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
334 BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
335 if (bb)
336 bb->cfg.tag = bb->getId();
337 }
338
339 // gather the first uses for each TEX
340 for (int i = 0; i < insns.getSize(); ++i) {
341 Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
342 if (isTextureOp(tex->op)) {
343 texes.push_back(tex);
344 if (!texCounts.at(tex->bb->getId()))
345 bbFirstTex[tex->bb->getId()] = texes.size() - 1;
346 texCounts[tex->bb->getId()]++;
347 }
348 }
349 insns.clear();
350 if (texes.empty())
351 return false;
352 uses = new std::list<TexUse>[texes.size()];
353 if (!uses)
354 return false;
355 for (size_t i = 0; i < texes.size(); ++i) {
356 findFirstUses(texes[i], uses[i]);
357 }
358
359 // determine the barrier level at each use
360 for (size_t i = 0; i < texes.size(); ++i) {
361 for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
362 ++u) {
363 BasicBlock *tb = texes[i]->bb;
364 BasicBlock *ub = u->insn->bb;
365 if (tb == ub) {
366 u->level = 0;
367 for (size_t j = i + 1; j < texes.size() &&
368 texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
369 ++j)
370 u->level++;
371 } else {
372 u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
373 &ub->cfg, texCounts);
374 if (u->level < 0) {
375 WARN("Failed to find path TEX -> TEXBAR\n");
376 u->level = 0;
377 continue;
378 }
379 // this counted all TEXes in the origin block, correct that
380 u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
381 // and did not count the TEXes in the destination block, add those
382 for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
383 texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
384 ++j)
385 u->level++;
386 }
387 assert(u->level >= 0);
388 useVec.push_back(*u);
389 }
390 }
391 delete[] uses;
392
393 // insert the barriers
394 for (size_t i = 0; i < useVec.size(); ++i) {
395 Instruction *prev = useVec[i].insn->prev;
396 if (useVec[i].level < 0)
397 continue;
398 if (prev && prev->op == OP_TEXBAR) {
399 if (prev->subOp > useVec[i].level)
400 prev->subOp = useVec[i].level;
401 prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
402 } else {
403 Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
404 bar->fixed = 1;
405 bar->subOp = useVec[i].level;
406 // make use explicit to ease latency calculation
407 bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
408 useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
409 }
410 }
411
412 if (fn->getProgram()->optLevel < 3)
413 return true;
414
415 std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
416
417 limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
418 limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
419 limitS.resize(fn->allBBlocks.getSize());
420
421 // cull unneeded barriers (should do that earlier, but for simplicity)
422 IteratorRef bi = fn->cfg.iteratorCFG();
423 // first calculate min/max outstanding TEXes for each BB
424 for (bi->reset(); !bi->end(); bi->next()) {
425 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
426 BasicBlock *bb = BasicBlock::get(n);
427 int min = 0;
428 int max = std::numeric_limits<int>::max();
429 for (Instruction *i = bb->getFirst(); i; i = i->next) {
430 if (isTextureOp(i->op)) {
431 min++;
432 if (max < std::numeric_limits<int>::max())
433 max++;
434 } else
435 if (i->op == OP_TEXBAR) {
436 min = MIN2(min, i->subOp);
437 max = MIN2(max, i->subOp);
438 }
439 }
440 // limits when looking at an isolated block
441 limitS[bb->getId()].min = min;
442 limitS[bb->getId()].max = max;
443 }
444 // propagate the min/max values
445 for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
446 for (bi->reset(); !bi->end(); bi->next()) {
447 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
448 BasicBlock *bb = BasicBlock::get(n);
449 const int bbId = bb->getId();
450 for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
451 BasicBlock *in = BasicBlock::get(ei.getNode());
452 const int inId = in->getId();
453 limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
454 limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
455 }
456 // I just hope this is correct ...
457 if (limitS[bbId].max == std::numeric_limits<int>::max()) {
458 // no barrier
459 limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
460 limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
461 } else {
462 // block contained a barrier
463 limitB[bbId].min = MIN2(limitS[bbId].max,
464 limitT[bbId].min + limitS[bbId].min);
465 limitB[bbId].max = MIN2(limitS[bbId].max,
466 limitT[bbId].max + limitS[bbId].min);
467 }
468 }
469 }
470 // finally delete unnecessary barriers
471 for (bi->reset(); !bi->end(); bi->next()) {
472 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
473 BasicBlock *bb = BasicBlock::get(n);
474 Instruction *prev = NULL;
475 Instruction *next;
476 int max = limitT[bb->getId()].max;
477 for (Instruction *i = bb->getFirst(); i; i = next) {
478 next = i->next;
479 if (i->op == OP_TEXBAR) {
480 if (i->subOp >= max) {
481 delete_Instruction(prog, i);
482 i = NULL;
483 } else {
484 max = i->subOp;
485 if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
486 delete_Instruction(prog, prev);
487 prev = NULL;
488 }
489 }
490 } else
491 if (isTextureOp(i->op)) {
492 max++;
493 }
494 if (i && !i->isNop())
495 prev = i;
496 }
497 }
498 return true;
499 }
500
501 bool
502 NVC0LegalizePostRA::visit(Function *fn)
503 {
504 if (needTexBar)
505 insertTextureBarriers(fn);
506
507 rZero = new_LValue(fn, FILE_GPR);
508 pOne = new_LValue(fn, FILE_PREDICATE);
509 carry = new_LValue(fn, FILE_FLAGS);
510
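   // Pin these to the hardware's fixed registers: the highest GPR ($r63, or
   // $r255 on chips with 255 GPRs) always reads zero, predicate 7 is the
   // always-true predicate, and flags register 0 holds the carry bit.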
511 rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
512 carry->reg.data.id = 0;
513 pOne->reg.data.id = 7;
514
515 return true;
516 }
517
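// Post-RA, rewrite immediate zero sources as reads of the fixed zero
// register; for SELP the immediate predicate source becomes $p7 (always
// true), with the NOT modifier toggled when the immediate was 0.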
518 void
519 NVC0LegalizePostRA::replaceZero(Instruction *i)
520 {
521 for (int s = 0; i->srcExists(s); ++s) {
522 if (s == 2 && i->op == OP_SUCLAMP)
523 continue;
524 ImmediateValue *imm = i->getSrc(s)->asImm();
525 if (imm) {
526 if (i->op == OP_SELP && s == 2) {
527 i->setSrc(s, pOne);
528 if (imm->reg.data.u64 == 0)
529 i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
530 } else if (imm->reg.data.u64 == 0) {
531 i->setSrc(s, rZero);
532 }
533 }
534 }
535 }
536
537 // replace CONT with BRA for single unconditional continue
538 bool
539 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
540 {
541 if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
542 return false;
543 Graph::EdgeIterator ei = bb->cfg.incident();
544 if (ei.getType() != Graph::Edge::BACK)
545 ei.next();
546 if (ei.getType() != Graph::Edge::BACK)
547 return false;
548 BasicBlock *contBB = BasicBlock::get(ei.getNode());
549
550 if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
551 contBB->getExit()->getPredicate())
552 return false;
553 contBB->getExit()->op = OP_BRA;
554 bb->remove(bb->getEntry()); // delete PRECONT
555
556 ei.next();
557 assert(ei.end() || ei.getType() != Graph::Edge::BACK);
558 return true;
559 }
560
561 // replace branches to join blocks with join ops
562 void
563 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
564 {
565 if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
566 return;
567 for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
568 BasicBlock *in = BasicBlock::get(ei.getNode());
569 Instruction *exit = in->getExit();
570 if (!exit) {
571 in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
572 // there should always be a terminator instruction
573 WARN("inserted missing terminator in BB:%i\n", in->getId());
574 } else
575 if (exit->op == OP_BRA) {
576 exit->op = OP_JOIN;
577 exit->asFlow()->limit = 1; // must-not-propagate marker
578 }
579 }
580 bb->remove(bb->getEntry());
581 }
582
583 bool
584 NVC0LegalizePostRA::visit(BasicBlock *bb)
585 {
586 Instruction *i, *next;
587
588 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
589 for (i = bb->getFirst(); i; i = next) {
590 next = i->next;
591 if (i->op == OP_EMIT || i->op == OP_RESTART) {
592 if (!i->getDef(0)->refCount())
593 i->setDef(0, NULL);
594 if (i->src(0).getFile() == FILE_IMMEDIATE)
595 i->setSrc(0, rZero); // initial value must be 0
596 replaceZero(i);
597 } else
598 if (i->isNop()) {
599 bb->remove(i);
600 } else
601 if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
602 prog->getType() != Program::TYPE_COMPUTE) {
603 // It seems like barriers are never required for tessellation since
604 // the warp size is 32, and there are always at most 32 tcs threads.
605 bb->remove(i);
606 } else
607 if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
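         // the encodable offset of an indirect LDC is small, so fold the
         // upper bits of large offsets into the constant buffer file index
         // and keep only the low 16 bits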
608 int offset = i->src(0).get()->reg.data.offset;
609 if (abs(offset) > 0x10000)
610 i->src(0).get()->reg.fileIndex += offset >> 16;
611 i->src(0).get()->reg.data.offset = (int)(short)offset;
612 } else {
613 // TODO: Move this to before register allocation for operations that
614 // need the $c register !
615 if (typeSizeof(i->dType) == 8) {
616 Instruction *hi;
617 hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
618 if (hi)
619 next = hi;
620 }
621
622 if (i->op != OP_MOV && i->op != OP_PFETCH)
623 replaceZero(i);
624 }
625 }
626 if (!bb->getEntry())
627 return true;
628
629 if (!tryReplaceContWithBra(bb))
630 propagateJoin(bb);
631
632 return true;
633 }
634
635 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
636 {
637 bld.setProgram(prog);
638 }
639
640 bool
641 NVC0LoweringPass::visit(Function *fn)
642 {
643 if (prog->getType() == Program::TYPE_GEOMETRY) {
644 assert(!strncmp(fn->getName(), "MAIN", 4));
645 // TODO: when we generate actual functions pass this value along somehow
646 bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
647 gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
648 if (fn->cfgExit) {
649 bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
650 bld.mkMovToReg(0, gpEmitAddress);
651 }
652 }
653 return true;
654 }
655
656 bool
657 NVC0LoweringPass::visit(BasicBlock *bb)
658 {
659 return true;
660 }
661
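// Load the texture handle for binding point 'slot' from the driver's
// auxiliary constant buffer; an indirect index in 'ptr' selects among
// consecutive 32-bit handles (hence the shift by 2).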
662 inline Value *
663 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
664 {
665 uint8_t b = prog->driver->io.auxCBSlot;
666 uint32_t off = prog->driver->io.texBindBase + slot * 4;
667
668 if (ptr)
669 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));
670
671 return bld.
672 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
673 }
674
675 // move array source to first slot, convert to u16, add indirections
676 bool
677 NVC0LoweringPass::handleTEX(TexInstruction *i)
678 {
679 const int dim = i->tex.target.getDim() + i->tex.target.isCube();
680 const int arg = i->tex.target.getArgCount();
681 const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
682 const int chipset = prog->getTarget()->getChipset();
683
684 /* Only normalize in the non-explicit derivatives case. For explicit
685 * derivatives, this is handled in handleManualTXD.
686 */
687 if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
688 Value *src[3], *val;
689 int c;
690 for (c = 0; c < 3; ++c)
691 src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
692 val = bld.getScratch();
693 bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
694 bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
695 bld.mkOp1(OP_RCP, TYPE_F32, val, val);
696 for (c = 0; c < 3; ++c) {
697 i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
698 i->getSrc(c), val));
699 }
700 }
701
702 // Arguments to the TEX instruction are a little insane. Even though the
703 // encoding is identical between SM20 and SM30, the arguments mean
704 // different things between Fermi and Kepler+. A lot of arguments are
705 // optional based on flags passed to the instruction. This summarizes the
706 // order of things.
707 //
708 // Fermi:
709 // array/indirect
710 // coords
711 // sample
712 // lod bias
713 // depth compare
714 // offsets:
715 // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
716 // - other: 4 bits each, single reg
717 //
718 // Kepler+:
719 // indirect handle
720 // array (+ offsets for txd in upper 16 bits)
721 // coords
722 // sample
723 // lod bias
724 // depth compare
725 // offsets (same as fermi, except txd which takes it with array)
726 //
727 // Maxwell (tex):
728 // array
729 // coords
730 // indirect handle
731 // sample
732 // lod bias
733 // depth compare
734 // offsets
735 //
736 // Maxwell (txd):
737 // indirect handle
738 // coords
739 // array + offsets
740 // derivatives
741
742 if (chipset >= NVISA_GK104_CHIPSET) {
743 if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
744 // XXX this ignores tsc, and assumes a 1:1 mapping
745 assert(i->tex.rIndirectSrc >= 0);
746 Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
747 i->tex.r = 0xff;
748 i->tex.s = 0x1f;
749 i->setIndirectR(hnd);
750 i->setIndirectS(NULL);
751 } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
752 if (i->tex.r == 0xffff)
753 i->tex.r = prog->driver->io.fbtexBindBase / 4;
754 else
755 i->tex.r += prog->driver->io.texBindBase / 4;
756 i->tex.s = 0; // only a single cX[] value possible here
757 } else {
758 Value *hnd = bld.getScratch();
759 Value *rHnd = loadTexHandle(NULL, i->tex.r);
760 Value *sHnd = loadTexHandle(NULL, i->tex.s);
761
762 bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
763
764 i->tex.r = 0; // not used for indirect tex
765 i->tex.s = 0;
766 i->setIndirectR(hnd);
767 }
768 if (i->tex.target.isArray()) {
769 LValue *layer = new_LValue(func, FILE_GPR);
770 Value *src = i->getSrc(lyr);
771 const int sat = (i->op == OP_TXF) ? 1 : 0;
772 DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
773 bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
774 if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
775 for (int s = dim; s >= 1; --s)
776 i->setSrc(s, i->getSrc(s - 1));
777 i->setSrc(0, layer);
778 } else {
779 i->setSrc(dim, layer);
780 }
781 }
782 // Move the indirect reference to the first place
783 if (i->tex.rIndirectSrc >= 0 && (
784 i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
785 Value *hnd = i->getIndirectR();
786
787 i->setIndirectR(NULL);
788 i->moveSources(0, 1);
789 i->setSrc(0, hnd);
790 i->tex.rIndirectSrc = 0;
791 i->tex.sIndirectSrc = -1;
792 }
793 // Move the indirect reference to right after the coords
794 else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
795 Value *hnd = i->getIndirectR();
796
797 i->setIndirectR(NULL);
798 i->moveSources(arg, 1);
799 i->setSrc(arg, hnd);
800 i->tex.rIndirectSrc = 0;
801 i->tex.sIndirectSrc = -1;
802 }
803 } else
804 // (nvc0) generate and move the tsc/tic/array source to the front
805 if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
806 LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
807
808 Value *ticRel = i->getIndirectR();
809 Value *tscRel = i->getIndirectS();
810
811 if (i->tex.r == 0xffff) {
812 i->tex.r = 0x20;
813 i->tex.s = 0x10;
814 }
815
816 if (ticRel) {
817 i->setSrc(i->tex.rIndirectSrc, NULL);
818 if (i->tex.r)
819 ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
820 ticRel, bld.mkImm(i->tex.r));
821 }
822 if (tscRel) {
823 i->setSrc(i->tex.sIndirectSrc, NULL);
824 if (i->tex.s)
825 tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
826 tscRel, bld.mkImm(i->tex.s));
827 }
828
829 Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
830 if (arrayIndex) {
831 for (int s = dim; s >= 1; --s)
832 i->setSrc(s, i->getSrc(s - 1));
833 i->setSrc(0, arrayIndex);
834 } else {
835 i->moveSources(0, 1);
836 }
837
838 if (arrayIndex) {
839 int sat = (i->op == OP_TXF) ? 1 : 0;
840 DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
841 bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
842 } else {
843 bld.loadImm(src, 0);
844 }
845
846 if (ticRel)
847 bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
848 if (tscRel)
849 bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
850
851 i->setSrc(0, src);
852 }
853
854   // For nvc0, the sample id has to go in the second operand, the same slot the
855   // offset uses. Right now we don't know how to pass both in, and this case can't
856 // happen with OpenGL. On nve0, the sample id is part of the texture
857 // coordinate argument.
858 assert(chipset >= NVISA_GK104_CHIPSET ||
859 !i->tex.useOffsets || !i->tex.target.isMS());
860
861 // offset is between lod and dc
862 if (i->tex.useOffsets) {
863 int n, c;
864 int s = i->srcCount(0xff, true);
865 if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
866 if (i->tex.target.isShadow())
867 s--;
868 if (i->srcExists(s)) // move potential predicate out of the way
869 i->moveSources(s, 1);
870 if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
871 i->moveSources(s + 1, 1);
872 }
873 if (i->op == OP_TXG) {
874 // Either there is 1 offset, which goes into the 2 low bytes of the
875 // first source, or there are 4 offsets, which go into 2 sources (8
876 // values, 1 byte each).
877 Value *offs[2] = {NULL, NULL};
878 for (n = 0; n < i->tex.useOffsets; n++) {
879 for (c = 0; c < 2; ++c) {
880 if ((n % 2) == 0 && c == 0)
881 bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
882 else
883 bld.mkOp3(OP_INSBF, TYPE_U32,
884 offs[n / 2],
885 i->offset[n][c].get(),
886 bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
887 offs[n / 2]);
888 }
889 }
890 i->setSrc(s, offs[0]);
891 if (offs[1])
892 i->setSrc(s + 1, offs[1]);
893 } else {
894 unsigned imm = 0;
895 assert(i->tex.useOffsets == 1);
896 for (c = 0; c < 3; ++c) {
897 ImmediateValue val;
898 if (!i->offset[0][c].getImmediate(val))
899 assert(!"non-immediate offset passed to non-TXG");
900 imm |= (val.reg.data.u32 & 0xf) << (c * 4);
901 }
902 if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
903 // The offset goes into the upper 16 bits of the array index. So
904 // create it if it's not already there, and INSBF it if it already
905 // is.
906 s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
907 if (chipset >= NVISA_GM107_CHIPSET)
908 s += dim;
909 if (i->tex.target.isArray()) {
910 bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
911 bld.loadImm(NULL, imm), bld.mkImm(0xc10),
912 i->getSrc(s));
913 } else {
914 i->moveSources(s, 1);
915 i->setSrc(s, bld.loadImm(NULL, imm << 16));
916 }
917 } else {
918 i->setSrc(s, bld.loadImm(NULL, imm));
919 }
920 }
921 }
922
923 if (chipset >= NVISA_GK104_CHIPSET) {
924 //
925 // If TEX requires more than 4 sources, the 2nd register tuple must be
926 // aligned to 4, even if it consists of just a single 4-byte register.
927 //
928 // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
929 //
930 int s = i->srcCount(0xff, true);
931 if (s > 4 && s < 7) {
932 if (i->srcExists(s)) // move potential predicate out of the way
933 i->moveSources(s, 7 - s);
934 while (s < 7)
935 i->setSrc(s++, bld.loadImm(NULL, 0));
936 }
937 }
938
939 return true;
940 }
941
942 bool
943 NVC0LoweringPass::handleManualTXD(TexInstruction *i)
944 {
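   // Emulate TXD with four plain TEX operations inside a QUADON/QUADPOP
   // region: for each lane l, broadcast that lane's coordinates to the whole
   // quad, offset them by the lane's dPdx/dPdy (so the hardware's implicit
   // derivatives effectively match the explicit ones), run the texture op,
   // and keep only lane l's result via the fixed, lane-masked MOVs; OP_UNION
   // then recombines the four per-lane results into the original defs.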
945 static const uint8_t qOps[4][2] =
946 {
947 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
948 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
949 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
950 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
951 };
952 Value *def[4][4];
953 Value *crd[3];
954 Instruction *tex;
955 Value *zero = bld.loadImm(bld.getSSA(), 0);
956 int l, c;
957 const int dim = i->tex.target.getDim() + i->tex.target.isCube();
958
959 // This function is invoked after handleTEX lowering, so we have to expect
960 // the arguments in the order that the hw wants them. For Fermi, array and
961 // indirect are both in the leading arg, while for Kepler, array and
962 // indirect are separate (and both precede the coordinates). Maxwell is
963 // handled in a separate function.
964 unsigned array;
965 if (targ->getChipset() < NVISA_GK104_CHIPSET)
966 array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
967 else
968 array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
969
970 i->op = OP_TEX; // no need to clone dPdx/dPdy later
971
972 for (c = 0; c < dim; ++c)
973 crd[c] = bld.getScratch();
974
975 bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
976 for (l = 0; l < 4; ++l) {
977 Value *src[3], *val;
978 // mov coordinates from lane l to all lanes
979 for (c = 0; c < dim; ++c)
980 bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
981 // add dPdx from lane l to lanes dx
982 for (c = 0; c < dim; ++c)
983 bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
984 // add dPdy from lane l to lanes dy
985 for (c = 0; c < dim; ++c)
986 bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
987 // normalize cube coordinates
988 if (i->tex.target.isCube()) {
989 for (c = 0; c < 3; ++c)
990 src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
991 val = bld.getScratch();
992 bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
993 bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
994 bld.mkOp1(OP_RCP, TYPE_F32, val, val);
995 for (c = 0; c < 3; ++c)
996 src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
997 } else {
998 for (c = 0; c < dim; ++c)
999 src[c] = crd[c];
1000 }
1001 // texture
1002 bld.insert(tex = cloneForward(func, i));
1003 for (c = 0; c < dim; ++c)
1004 tex->setSrc(c + array, src[c]);
1005 // save results
1006 for (c = 0; i->defExists(c); ++c) {
1007 Instruction *mov;
1008 def[c][l] = bld.getSSA();
1009 mov = bld.mkMov(def[c][l], tex->getDef(c));
1010 mov->fixed = 1;
1011 mov->lanes = 1 << l;
1012 }
1013 }
1014 bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1015
1016 for (c = 0; i->defExists(c); ++c) {
1017 Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1018 for (l = 0; l < 4; ++l)
1019 u->setSrc(l, def[c][l]);
1020 }
1021
1022 i->bb->remove(i);
1023 return true;
1024 }
1025
1026 bool
1027 NVC0LoweringPass::handleTXD(TexInstruction *txd)
1028 {
1029 int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
1030 unsigned arg = txd->tex.target.getArgCount();
1031 unsigned expected_args = arg;
1032 const int chipset = prog->getTarget()->getChipset();
1033
1034 if (chipset >= NVISA_GK104_CHIPSET) {
1035 if (!txd->tex.target.isArray() && txd->tex.useOffsets)
1036 expected_args++;
1037 if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
1038 expected_args++;
1039 } else {
1040 if (txd->tex.useOffsets)
1041 expected_args++;
1042 if (!txd->tex.target.isArray() && (
1043 txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
1044 expected_args++;
1045 }
1046
1047 if (expected_args > 4 ||
1048 dim > 2 ||
1049 txd->tex.target.isShadow())
1050 txd->op = OP_TEX;
1051
1052 handleTEX(txd);
1053 while (txd->srcExists(arg))
1054 ++arg;
1055
1056 txd->tex.derivAll = true;
1057 if (txd->op == OP_TEX)
1058 return handleManualTXD(txd);
1059
1060 assert(arg == expected_args);
1061 for (int c = 0; c < dim; ++c) {
1062 txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
1063 txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
1064 txd->dPdx[c].set(NULL);
1065 txd->dPdy[c].set(NULL);
1066 }
1067
1068 // In this case we have fewer than 4 "real" arguments, which means that
1069 // handleTEX didn't apply any padding. However we have to make sure that
1070 // the second "group" of arguments still gets padded up to 4.
1071 if (chipset >= NVISA_GK104_CHIPSET) {
1072 int s = arg + 2 * dim;
1073 if (s >= 4 && s < 7) {
1074 if (txd->srcExists(s)) // move potential predicate out of the way
1075 txd->moveSources(s, 7 - s);
1076 while (s < 7)
1077 txd->setSrc(s++, bld.loadImm(NULL, 0));
1078 }
1079 }
1080
1081 return true;
1082 }
1083
1084 bool
1085 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
1086 {
1087 const int chipset = prog->getTarget()->getChipset();
1088 if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
1089 txq->tex.r += prog->driver->io.texBindBase / 4;
1090
1091 if (txq->tex.rIndirectSrc < 0)
1092 return true;
1093
1094 Value *ticRel = txq->getIndirectR();
1095
1096 txq->setIndirectS(NULL);
1097 txq->tex.sIndirectSrc = -1;
1098
1099 assert(ticRel);
1100
1101 if (chipset < NVISA_GK104_CHIPSET) {
1102 LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
1103
1104 txq->setSrc(txq->tex.rIndirectSrc, NULL);
1105 if (txq->tex.r)
1106 ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1107 ticRel, bld.mkImm(txq->tex.r));
1108
1109 bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
1110
1111 txq->moveSources(0, 1);
1112 txq->setSrc(0, src);
1113 } else {
1114 Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
1115 txq->tex.r = 0xff;
1116 txq->tex.s = 0x1f;
1117
1118 txq->setIndirectR(NULL);
1119 txq->moveSources(0, 1);
1120 txq->setSrc(0, hnd);
1121 txq->tex.rIndirectSrc = 0;
1122 }
1123
1124 return true;
1125 }
1126
1127 bool
1128 NVC0LoweringPass::handleTXLQ(TexInstruction *i)
1129 {
1130 /* The outputs are inverted compared to what the TGSI instruction
1131 * expects. Take that into account in the mask.
1132 */
1133 assert((i->tex.mask & ~3) == 0);
1134 if (i->tex.mask == 1)
1135 i->tex.mask = 2;
1136 else if (i->tex.mask == 2)
1137 i->tex.mask = 1;
1138 handleTEX(i);
1139 bld.setPosition(i, true);
1140
1141 /* The returned values are not quite what we want:
1142 * (a) convert from s16/u16 to f32
1143 * (b) multiply by 1/256
1144 */
1145 for (int def = 0; def < 2; ++def) {
1146 if (!i->defExists(def))
1147 continue;
1148 enum DataType type = TYPE_S16;
1149 if (i->tex.mask == 2 || def > 0)
1150 type = TYPE_U16;
1151 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
1152 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1153 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1154 }
1155 if (i->tex.mask == 3) {
1156 LValue *t = new_LValue(func, FILE_GPR);
1157 bld.mkMov(t, i->getDef(0));
1158 bld.mkMov(i->getDef(0), i->getDef(1));
1159 bld.mkMov(i->getDef(1), t);
1160 }
1161 return true;
1162 }
1163
1164 bool
1165 NVC0LoweringPass::handleBUFQ(Instruction *bufq)
1166 {
1167 bufq->op = OP_MOV;
1168 bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
1169 bufq->getSrc(0)->reg.fileIndex * 16));
1170 bufq->setIndirect(0, 0, NULL);
1171 bufq->setIndirect(0, 1, NULL);
1172 return true;
1173 }
1174
1175 void
1176 NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
1177 {
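   // Kepler has no native shared-memory atomics, so emulate one with a retry
   // loop spread over new basic blocks: load with the LOCK subop, compute or
   // select the new value, store with the UNLOCK subop, and branch back to
   // the load until the unlocked store reports success.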
1178 assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1179
1180 BasicBlock *currBB = atom->bb;
1181 BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
1182 BasicBlock *joinBB = atom->bb->splitAfter(atom);
1183 BasicBlock *setAndUnlockBB = new BasicBlock(func);
1184 BasicBlock *failLockBB = new BasicBlock(func);
1185
1186 bld.setPosition(currBB, true);
1187 assert(!currBB->joinAt);
1188 currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1189
1190 CmpInstruction *pred =
1191 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1192 TYPE_U32, bld.mkImm(0), bld.mkImm(1));
1193
1194 bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
1195 currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
1196
1197 bld.setPosition(tryLockBB, true);
1198
1199 Instruction *ld =
1200 bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1201 atom->getIndirect(0, 0));
1202 ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1203 ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1204
1205 bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
1206 bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1207 tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
1208 tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
1209
1210 tryLockBB->cfg.detach(&joinBB->cfg);
1211 bld.remove(atom);
1212
1213 bld.setPosition(setAndUnlockBB, true);
1214 Value *stVal;
1215 if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1216 // Read the old value, and write the new one.
1217 stVal = atom->getSrc(1);
1218 } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1219 CmpInstruction *set =
1220 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
1221 TYPE_U32, ld->getDef(0), atom->getSrc(1));
1222
1223 bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
1224 TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
1225 } else {
1226 operation op;
1227
1228 switch (atom->subOp) {
1229 case NV50_IR_SUBOP_ATOM_ADD:
1230 op = OP_ADD;
1231 break;
1232 case NV50_IR_SUBOP_ATOM_AND:
1233 op = OP_AND;
1234 break;
1235 case NV50_IR_SUBOP_ATOM_OR:
1236 op = OP_OR;
1237 break;
1238 case NV50_IR_SUBOP_ATOM_XOR:
1239 op = OP_XOR;
1240 break;
1241 case NV50_IR_SUBOP_ATOM_MIN:
1242 op = OP_MIN;
1243 break;
1244 case NV50_IR_SUBOP_ATOM_MAX:
1245 op = OP_MAX;
1246 break;
1247 default:
1248 assert(0);
1249 return;
1250 }
1251
1252 stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
1253 atom->getSrc(1));
1254 }
1255
1256 Instruction *st =
1257 bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1258 atom->getIndirect(0, 0), stVal);
1259 st->setDef(0, pred->getDef(0));
1260 st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1261
1262 bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1263 setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
1264
1265    // Keep retrying until the unlocked store has actually been performed.
1266 bld.setPosition(failLockBB, true);
1267 bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
1268 bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1269 failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
1270 failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
1271
1272 bld.setPosition(joinBB, false);
1273 bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1274 }
1275
1276 void
1277 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
1278 {
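   // Fermi variant of the shared-memory atomic emulation: the locked load
   // provides the lock predicate directly, the update and the unlocking
   // store are predicated on it, and the block branches back to itself until
   // the lock was held.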
1279 assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1280
1281 BasicBlock *currBB = atom->bb;
1282 BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
1283 BasicBlock *joinBB = atom->bb->splitAfter(atom);
1284
1285 bld.setPosition(currBB, true);
1286 assert(!currBB->joinAt);
1287 currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1288
1289 bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
1290 currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);
1291
1292 bld.setPosition(tryLockAndSetBB, true);
1293
1294 Instruction *ld =
1295 bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1296 atom->getIndirect(0, 0));
1297 ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1298 ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1299
1300 Value *stVal;
1301 if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1302 // Read the old value, and write the new one.
1303 stVal = atom->getSrc(1);
1304 } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1305 CmpInstruction *set =
1306 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1307 TYPE_U32, ld->getDef(0), atom->getSrc(1));
1308 set->setPredicate(CC_P, ld->getDef(1));
1309
1310 Instruction *selp =
1311 bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
1312 atom->getSrc(2), set->getDef(0));
1313 selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
1314 selp->setPredicate(CC_P, ld->getDef(1));
1315
1316 stVal = selp->getDef(0);
1317 } else {
1318 operation op;
1319
1320 switch (atom->subOp) {
1321 case NV50_IR_SUBOP_ATOM_ADD:
1322 op = OP_ADD;
1323 break;
1324 case NV50_IR_SUBOP_ATOM_AND:
1325 op = OP_AND;
1326 break;
1327 case NV50_IR_SUBOP_ATOM_OR:
1328 op = OP_OR;
1329 break;
1330 case NV50_IR_SUBOP_ATOM_XOR:
1331 op = OP_XOR;
1332 break;
1333 case NV50_IR_SUBOP_ATOM_MIN:
1334 op = OP_MIN;
1335 break;
1336 case NV50_IR_SUBOP_ATOM_MAX:
1337 op = OP_MAX;
1338 break;
1339 default:
1340 assert(0);
1341 return;
1342 }
1343
1344 Instruction *i =
1345 bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
1346 atom->getSrc(1));
1347 i->setPredicate(CC_P, ld->getDef(1));
1348
1349 stVal = i->getDef(0);
1350 }
1351
1352 Instruction *st =
1353 bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1354 atom->getIndirect(0, 0), stVal);
1355 st->setPredicate(CC_P, ld->getDef(1));
1356 st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1357
1358 // Loop until the lock is acquired.
1359 bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
1360 tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
1361 tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
1362 bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1363
1364 bld.remove(atom);
1365
1366 bld.setPosition(joinBB, false);
1367 bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1368 }
1369
1370 bool
1371 NVC0LoweringPass::handleATOM(Instruction *atom)
1372 {
1373 SVSemantic sv;
1374 Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
1375
1376 switch (atom->src(0).getFile()) {
1377 case FILE_MEMORY_LOCAL:
1378 sv = SV_LBASE;
1379 break;
1380 case FILE_MEMORY_SHARED:
1381 // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1382 // operations on shared memory. For Maxwell, ATOMS is enough.
1383 if (targ->getChipset() < NVISA_GK104_CHIPSET)
1384 handleSharedATOM(atom);
1385 else if (targ->getChipset() < NVISA_GM107_CHIPSET)
1386 handleSharedATOMNVE4(atom);
1387 return true;
1388 default:
1389 assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
1390 base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
1391 assert(base->reg.size == 8);
1392 if (ptr)
1393 base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
1394 assert(base->reg.size == 8);
1395 atom->setIndirect(0, 0, base);
1396 atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1397
1398 // Harden against out-of-bounds accesses
1399 Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
1400 Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
1401 Value *pred = new_LValue(func, FILE_PREDICATE);
1402 if (ptr)
1403 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
1404 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
1405 atom->setPredicate(CC_NOT_P, pred);
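      // If the access is out of bounds the atomic is predicated off, which
      // would leave its destination undefined; union it with a zero written
      // under the inverse predicate so such reads return 0.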
1406 if (atom->defExists(0)) {
1407 Value *zero, *dst = atom->getDef(0);
1408 atom->setDef(0, bld.getSSA());
1409
1410 bld.setPosition(atom, true);
1411 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
1412 ->setPredicate(CC_P, pred);
1413 bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
1414 }
1415
1416 return true;
1417 }
1418 base =
1419 bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
1420
1421 atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
1422 atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1423 if (ptr)
1424 base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
1425 atom->setIndirect(0, 1, NULL);
1426 atom->setIndirect(0, 0, base);
1427
1428 return true;
1429 }
1430
1431 bool
1432 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
1433 {
1434 if (targ->getChipset() < NVISA_GM107_CHIPSET) {
1435 if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
1436 // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1437 return false;
1438 }
1439 }
1440
1441 if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
1442 cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
1443 return false;
1444 bld.setPosition(cas, true);
1445
1446 if (needCctl) {
1447 Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
1448 cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
1449 cctl->fixed = 1;
1450 cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
1451 if (cas->isPredicated())
1452 cctl->setPredicate(cas->cc, cas->getPredicate());
1453 }
1454
1455 if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1456      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
1457 // should be set to the high part of the double reg or bad things will
1458 // happen elsewhere in the universe.
1459 // Also, it sometimes returns the new value instead of the old one
1460 // under mysterious circumstances.
1461 Value *dreg = bld.getSSA(8);
1462 bld.setPosition(cas, false);
1463 bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
1464 cas->setSrc(1, dreg);
1465 cas->setSrc(2, dreg);
1466 }
1467
1468 return true;
1469 }
1470
1471 inline Value *
1472 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
1473 {
1474 uint8_t b = prog->driver->io.auxCBSlot;
1475 off += base;
1476
1477 return bld.
1478 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1479 }
1480
1481 inline Value *
1482 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
1483 {
1484 uint8_t b = prog->driver->io.auxCBSlot;
1485 off += base;
1486
1487 if (ptr)
1488 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1489
1490 return bld.
1491 mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
1492 }
1493
1494 inline Value *
1495 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
1496 {
1497 uint8_t b = prog->driver->io.auxCBSlot;
1498 off += base;
1499
1500 if (ptr)
1501 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1502
1503 return bld.
1504 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
1505 }
1506
1507 inline Value *
1508 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
1509 {
1510 return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
1511 }
1512
1513 inline Value *
1514 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
1515 {
1516 return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
1517 }
1518
1519 inline Value *
1520 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
1521 {
1522 return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
1523 }
1524
1525 inline Value *
1526 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
1527 {
1528 return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
1529 }
1530
1531 inline Value *
1532 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
1533 {
1534 uint8_t b = prog->driver->io.msInfoCBSlot;
1535 off += prog->driver->io.msInfoBase;
1536 return bld.
1537 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1538 }
1539
1540 /* On nvc0, surface info is obtained via the surface binding points passed
1541 * to the SULD/SUST instructions.
1542 * On nve4, surface info is stored in c[] and is used by various special
1543 * instructions, e.g. for clamping coordinates or generating an address.
1544  * They couldn't just have added an equivalent to TIC now, could they?
1545 */
1546 #define NVC0_SU_INFO_ADDR 0x00
1547 #define NVC0_SU_INFO_FMT 0x04
1548 #define NVC0_SU_INFO_DIM_X 0x08
1549 #define NVC0_SU_INFO_PITCH 0x0c
1550 #define NVC0_SU_INFO_DIM_Y 0x10
1551 #define NVC0_SU_INFO_ARRAY 0x14
1552 #define NVC0_SU_INFO_DIM_Z 0x18
1553 #define NVC0_SU_INFO_UNK1C 0x1c
1554 #define NVC0_SU_INFO_WIDTH 0x20
1555 #define NVC0_SU_INFO_HEIGHT 0x24
1556 #define NVC0_SU_INFO_DEPTH 0x28
1557 #define NVC0_SU_INFO_TARGET 0x2c
1558 #define NVC0_SU_INFO_BSIZE 0x30
1559 #define NVC0_SU_INFO_RAW_X 0x34
1560 #define NVC0_SU_INFO_MS_X 0x38
1561 #define NVC0_SU_INFO_MS_Y 0x3c
1562
1563 #define NVC0_SU_INFO__STRIDE 0x40
1564
1565 #define NVC0_SU_INFO_DIM(i) (0x08 + (i) * 8)
1566 #define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
1567 #define NVC0_SU_INFO_MS(i) (0x38 + (i) * 4)
1568
1569 inline Value *
1570 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off)
1571 {
1572 uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1573
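   // With an indirect surface index, clamp (slot + index) to the 8 binding
   // points and scale by the 0x40-byte per-surface info stride instead of
   // using a constant base.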
1574 if (ptr) {
1575 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1576 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1577 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1578 base = 0;
1579 }
1580 off += base;
1581
1582 return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
1583 }
1584
1585 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1586 {
1587 switch (su->tex.target.getEnum()) {
1588 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1589 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1590 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1591 case TEX_TARGET_1D_ARRAY: return (c == 1) ?
1592 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1593 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1594 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1595 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1596 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1597 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1598 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1599 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1600 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1601 default:
1602 assert(0);
1603 return 0;
1604 }
1605 }
1606
1607 bool
1608 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
1609 {
1610 int mask = suq->tex.mask;
1611 int dim = suq->tex.target.getDim();
1612 int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1613 Value *ind = suq->getIndirectR();
1614 int slot = suq->tex.r;
1615 int c, d;
1616
1617 for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1618 if (c >= arg || !(mask & 1))
1619 continue;
1620
1621 int offset;
1622
1623 if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1624 offset = NVC0_SU_INFO_SIZE(2);
1625 } else {
1626 offset = NVC0_SU_INFO_SIZE(c);
1627 }
1628 bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset));
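      // the stored layer count for cube (arrays) presumably includes all six
      // faces, so report whole cubes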
1629 if (c == 2 && suq->tex.target.isCube())
1630 bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1631 bld.loadImm(NULL, 6));
1632 }
1633
1634 if (mask & 1) {
1635 if (suq->tex.target.isMS()) {
1636 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
1637 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));
1638 Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1639 bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1640 } else {
1641 bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1642 }
1643 }
1644
1645 bld.remove(suq);
1646 return true;
1647 }
1648
1649 void
1650 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1651 {
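   // Lower a multisampled fetch to a plain 2D (array) fetch: scale x/y by
   // the log2 sample counts from the surface info, add the per-sample pixel
   // offsets from the MS info table, and drop the now-consumed sample index
   // argument.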
1652 const int arg = tex->tex.target.getArgCount();
1653 int slot = tex->tex.r;
1654
1655 if (tex->tex.target == TEX_TARGET_2D_MS)
1656 tex->tex.target = TEX_TARGET_2D;
1657 else
1658 if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1659 tex->tex.target = TEX_TARGET_2D_ARRAY;
1660 else
1661 return;
1662
1663 Value *x = tex->getSrc(0);
1664 Value *y = tex->getSrc(1);
1665 Value *s = tex->getSrc(arg - 1);
1666
1667 Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1668 Value *ind = tex->getIndirectR();
1669
1670 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
1671 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));
1672
1673 bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1674 bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1675
1676 s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1677 s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1678
1679 Value *dx = loadMsInfo32(ts, 0x0);
1680 Value *dy = loadMsInfo32(ts, 0x4);
1681
1682 bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1683 bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1684
1685 tex->setSrc(0, tx);
1686 tex->setSrc(1, ty);
1687 tex->moveSources(arg, -1);
1688 }
1689
1690 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1691 // They're computed from the coordinates using the surface info in c[] space.
1692 void
1693 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
1694 {
1695 Instruction *insn;
1696 const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
1697 const bool raw =
1698 su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
1699 const int slot = su->tex.r;
1700 const int dim = su->tex.target.getDim();
1701 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1702 int c;
1703 Value *zero = bld.mkImm(0);
1704 Value *p1 = NULL;
1705 Value *v;
1706 Value *src[3];
1707 Value *bf, *eau, *off;
1708 Value *addr, *pred;
1709 Value *ind = su->getIndirectR();
1710
1711 off = bld.getScratch(4);
1712 bf = bld.getScratch(4);
1713 addr = bld.getSSA(8);
1714 pred = bld.getScratch(1, FILE_PREDICATE);
1715
1716 bld.setPosition(su, false);
1717
1718 adjustCoordinatesMS(su);
1719
1720 // calculate clamped coordinates
1721 for (c = 0; c < arg; ++c) {
1722 int dimc = c;
1723
1724 if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
1725 // The array index is stored in the Z component for 1D arrays.
1726 dimc = 2;
1727 }
1728
1729 src[c] = bld.getScratch();
1730 if (c == 0 && raw)
1731 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X);
1732 else
1733 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc));
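// SUCLAMP clamps the coordinate against the limit loaded just above and
// raises a flag when it was out of range; that flag feeds the out-of-bounds
// predicate set up below.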
1734 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
1735 ->subOp = getSuClampSubOp(su, dimc);
1736 }
1737 for (; c < 3; ++c)
1738 src[c] = zero;
1739
1740 // set predicate output
1741 if (su->tex.target == TEX_TARGET_BUFFER) {
1742 src[0]->getInsn()->setFlagsDef(1, pred);
1743 } else
1744 if (su->tex.target.isArray() || su->tex.target.isCube()) {
1745 p1 = bld.getSSA(1, FILE_PREDICATE);
1746 src[dim]->getInsn()->setFlagsDef(1, p1);
1747 }
1748
1749 // calculate pixel offset
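// (2D: roughly off = y * pitch + x; 3D additionally folds z in via the
// UNK1C surface info. The MADSP sub-ops select which 16/32-bit halves of
// each operand take part in the multiply-add.)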
1750 if (dim == 1) {
1751 if (su->tex.target != TEX_TARGET_BUFFER)
1752 bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
1753 } else
1754 if (dim == 3) {
1755 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
1756 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
1757 ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1758
1759 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
1760 bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
1761 ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
1762 } else {
1763 assert(dim == 2);
1764 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
1765 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
1766 ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
1767 NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1768 }
1769
1770 // calculate effective address part 1
1771 if (su->tex.target == TEX_TARGET_BUFFER) {
1772 if (raw) {
1773 bf = src[0];
1774 } else {
1775 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);
1776 bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
1777 ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
1778 }
1779 } else {
1780 Value *y = src[1];
1781 Value *z = src[2];
1782 uint16_t subOp = 0;
1783
1784 switch (dim) {
1785 case 1:
1786 y = zero;
1787 z = zero;
1788 break;
1789 case 2:
1790 z = off;
1791 if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
1792 z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
1793 subOp = NV50_IR_SUBOP_SUBFM_3D;
1794 }
1795 break;
1796 default:
1797 subOp = NV50_IR_SUBOP_SUBFM_3D;
1798 assert(dim == 3);
1799 break;
1800 }
1801 insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
1802 insn->subOp = subOp;
1803 insn->setFlagsDef(1, pred);
1804 }
1805
1806 // part 2
1807 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR);
1808
1809 if (su->tex.target == TEX_TARGET_BUFFER) {
1810 eau = v;
1811 } else {
1812 eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
1813 }
1814 // add array layer offset
1815 if (su->tex.target.isArray() || su->tex.target.isCube()) {
1816 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
1817 if (dim == 1)
1818 bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
1819 ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
1820 else
1821 bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
1822 ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
1823 // combine predicates
1824 assert(p1);
1825 bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
1826 }
1827
1828 if (atom) {
1829 Value *lo = bf;
1830 if (su->tex.target == TEX_TARGET_BUFFER) {
1831 lo = zero;
1832 bld.mkMov(off, bf);
1833 }
1834 // bf == g[] address & 0xff
1835 // eau == g[] address >> 8
1836 bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
1837 bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
1838 } else
1839 if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
1840 // Convert from u32 to u8 address format, which is what the library code
1841 // implementing SULDP currently uses.
1842 // XXX: can SUEAU do this ?
1843 // XXX: does it matter that we don't mask high bytes in bf ?
1844 // Grrr.
1845 bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
1846 bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
1847 }
1848
1849 bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
1850
1851 if (atom && su->tex.target == TEX_TARGET_BUFFER)
1852 bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
1853
1854 // let's just set it to 0 for raw access and hope it works
1855 v = raw ?
1856 bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);
1857
1858 // get rid of old coordinate sources, make space for fmt info and predicate
1859 su->moveSources(arg, 3 - arg);
1860 // set 64-bit address and 32-bit format sources
1861 su->setSrc(0, addr);
1862 su->setSrc(1, v);
1863 su->setSrc(2, pred);
1864
1865 // prevent read fault when the image is not actually bound
1866 CmpInstruction *pred1 =
1867 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1868 TYPE_U32, bld.mkImm(0),
1869 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
1870
1871 if (su->op != OP_SUSTP && su->tex.format) {
1872 const TexInstruction::ImgFormatDesc *format = su->tex.format;
1873 int blockwidth = format->bits[0] + format->bits[1] +
1874 format->bits[2] + format->bits[3];
1875
1876 // make sure that the format doesn't mismatch
1877 assert(format->components != 0);
1878 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
1879 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
1880 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
1881 pred1->getDef(0));
1882 }
1883 su->setPredicate(CC_NOT_P, pred1->getDef(0));
1884
1885 // TODO: initialize def values to 0 when the surface operation is not
1886 // performed (not needed for stores). Also, fix the "address bounds test"
1887 // subtests from arb_shader_image_load_store-invalid for buffers, because it
1888 // seems that the predicate is not correctly set by suclamp.
1889 }
1890
1891 static DataType
1892 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
1893 {
1894 switch (t->type) {
1895 case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
1896 case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
1897 case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
1898 case UINT:
1899 return (t->bits[c] == 8 ? TYPE_U8 :
1900 (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
1901 case SINT:
1902 return (t->bits[c] == 8 ? TYPE_S8 :
1903 (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
1904 }
1905 return TYPE_NONE;
1906 }
1907
1908 static DataType
1909 getDestType(const ImgType type) {
1910 switch (type) {
1911 case FLOAT:
1912 case UNORM:
1913 case SNORM:
1914 return TYPE_F32;
1915 case UINT:
1916 return TYPE_U32;
1917 case SINT:
1918 return TYPE_S32;
1919 default:
1920 assert(!"Impossible type");
1921 return TYPE_NONE;
1922 }
1923 }
1924
1925 void
1926 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
1927 {
1928 const TexInstruction::ImgFormatDesc *format = su->tex.format;
1929 int width = format->bits[0] + format->bits[1] +
1930 format->bits[2] + format->bits[3];
1931 Value *untypedDst[4] = {};
1932 Value *typedDst[4] = {};
1933
1934 // We must convert this to a generic load.
1935 su->op = OP_SULDB;
1936
1937 su->dType = typeOfSize(width / 8);
1938 su->sType = TYPE_U8;
1939
1940 for (int i = 0; i < width / 32; i++)
1941 untypedDst[i] = bld.getSSA();
1942 if (width < 32)
1943 untypedDst[0] = bld.getSSA();
1944
1945 for (int i = 0; i < 4; i++) {
1946 typedDst[i] = su->getDef(i);
1947 }
1948
1949 // Set the untyped dsts as the su's destinations
1950 for (int i = 0; i < 4; i++)
1951 su->setDef(i, untypedDst[i]);
1952
1953 bld.setPosition(su, true);
1954
1955 // Unpack each component into the typed dsts
1956 int bits = 0;
1957 for (int i = 0; i < 4; bits += format->bits[i], i++) {
1958 if (!typedDst[i])
1959 continue;
1960 if (i >= format->components) {
1961 if (format->type == FLOAT ||
1962 format->type == UNORM ||
1963 format->type == SNORM)
1964 bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
1965 else
1966 bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
1967 continue;
1968 }
1969
1970 // Get just that component's data into the relevant place
1971 if (format->bits[i] == 32)
1972 bld.mkMov(typedDst[i], untypedDst[i]);
1973 else if (format->bits[i] == 16)
1974 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
1975 getSrcType(format, i), untypedDst[i / 2])
1976 ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
1977 else if (format->bits[i] == 8)
1978 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
1979 getSrcType(format, i), untypedDst[0])->subOp = i;
1980 else {
1981 bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
1982 bld.mkImm((bits % 32) | (format->bits[i] << 8)));
1983 if (format->type == UNORM || format->type == SNORM)
1984 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
1985 }
1986
1987 // Normalize / convert as necessary
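// (UNORM maps [0, 2^bits - 1] onto [0.0, 1.0], SNORM maps
// [-(2^(bits-1) - 1), 2^(bits-1) - 1] onto [-1.0, 1.0]; sub-16-bit floats
// are shifted so their bits line up with the f16 layout before the CVT.)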
1988 if (format->type == UNORM)
1989 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
1990 else if (format->type == SNORM)
1991 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
1992 else if (format->type == FLOAT && format->bits[i] < 16) {
1993 bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
1994 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
1995 }
1996 }
1997
1998 if (format->bgra) {
1999 std::swap(typedDst[0], typedDst[2]);
2000 }
2001 }
2002
2003 void
2004 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2005 {
2006 processSurfaceCoordsNVE4(su);
2007
2008 if (su->op == OP_SULDP)
2009 convertSurfaceFormat(su);
2010
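// Reductions become a global OP_ATOM on the address computed above; the
// bounds predicate (now src 2) is OR'd in so the atomic is skipped and the
// destination is forced to 0 when the access is out of bounds.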
2011 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2012 assert(su->getPredicate());
2013 Value *pred =
2014 bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2015 su->getPredicate(), su->getSrc(2));
2016
2017 Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2018 red->subOp = su->subOp;
2019 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2020 red->setSrc(1, su->getSrc(3));
2021 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2022 red->setSrc(2, su->getSrc(4));
2023 red->setIndirect(0, 0, su->getSrc(0));
2024
2025 // make sure to initialize dst value when the atomic operation is not
2026 // performed
2027 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2028
2029 assert(su->cc == CC_NOT_P);
2030 red->setPredicate(su->cc, pred);
2031 mov->setPredicate(CC_P, pred);
2032
2033 bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2034 red->getDef(0), mov->getDef(0));
2035
2036 delete_Instruction(bld.getProgram(), su);
2037 handleCasExch(red, true);
2038 }
2039
2040 if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2041 su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2042 }
2043
2044 void
2045 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2046 {
2047 const int slot = su->tex.r;
2048 const int dim = su->tex.target.getDim();
2049 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2050 int c;
2051 Value *zero = bld.mkImm(0);
2052 Value *src[3];
2053 Value *v;
2054 Value *ind = su->getIndirectR();
2055
2056 bld.setPosition(su, false);
2057
2058 adjustCoordinatesMS(su);
2059
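// For indirectly addressed images, fold the base slot into the supplied
// index and wrap it into the available image slots (the & 7 below).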
2060 if (ind) {
2061 Value *ptr;
2062 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2063 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2064 su->setIndirectR(ptr);
2065 }
2066
2067 // get surface coordinates
2068 for (c = 0; c < arg; ++c)
2069 src[c] = su->getSrc(c);
2070 for (; c < 3; ++c)
2071 src[c] = zero;
2072
2073 // calculate pixel offset
2074 if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2075 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE);
2076 su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
2077 }
2078
2079 // add array layer offset
2080 if (su->tex.target.isArray() || su->tex.target.isCube()) {
2081 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
2082 assert(dim > 1);
2083 su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
2084 }
2085
2086 // prevent read fault when the image is not actually bound
2087 CmpInstruction *pred =
2088 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2089 TYPE_U32, bld.mkImm(0),
2090 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
2091 if (su->op != OP_SUSTP && su->tex.format) {
2092 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2093 int blockwidth = format->bits[0] + format->bits[1] +
2094 format->bits[2] + format->bits[3];
2095
2096 assert(format->components != 0);
2097 // make sure that the format doesn't mismatch when it's not FMT_NONE
2098 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2099 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2100 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
2101 pred->getDef(0));
2102 }
2103 su->setPredicate(CC_NOT_P, pred->getDef(0));
2104 }
2105
2106 void
2107 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2108 {
2109 if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2110 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2111 * will simplify the lowering pass and the texture constraints. */
2112 su->moveSources(1, 1);
2113 su->setSrc(1, bld.loadImm(NULL, 0));
2114 su->tex.target = TEX_TARGET_2D_ARRAY;
2115 }
2116
2117 processSurfaceCoordsNVC0(su);
2118
2119 if (su->op == OP_SULDP)
2120 convertSurfaceFormat(su);
2121
2122 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2123 const int dim = su->tex.target.getDim();
2124 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2125 LValue *addr = bld.getSSA(8);
2126 Value *def = su->getDef(0);
2127
2128 su->op = OP_SULEA;
2129
2130 // Set the destination to the address
2131 su->dType = TYPE_U64;
2132 su->setDef(0, addr);
2133 su->setDef(1, su->getPredicate());
2134
2135 bld.setPosition(su, true);
2136
2137 // Perform the atomic op
2138 Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2139 red->subOp = su->subOp;
2140 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2141 red->setSrc(1, su->getSrc(arg));
2142 if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2143 red->setSrc(2, su->getSrc(arg + 1));
2144 red->setIndirect(0, 0, addr);
2145
2146 // make sure to initialize dst value when the atomic operation is not
2147 // performed
2148 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2149
2150 assert(su->cc == CC_NOT_P);
2151 red->setPredicate(su->cc, su->getPredicate());
2152 mov->setPredicate(CC_P, su->getPredicate());
2153
2154 bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2155
2156 handleCasExch(red, false);
2157 }
2158 }
2159
2160 void
2161 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
2162 {
2163 const int slot = su->tex.r;
2164 const int dim = su->tex.target.getDim();
2165 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2166 Value *ind = su->getIndirectR();
2167 int pos = 0;
2168
2169 bld.setPosition(su, false);
2170
2171 // add texture handle
2172 switch (su->op) {
2173 case OP_SUSTP:
2174 pos = 4;
2175 break;
2176 case OP_SUREDP:
2177 pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2178 break;
2179 default:
2180 assert(pos == 0);
2181 break;
2182 }
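// (the image handles are assumed to be stored right after the 32 texture
// handles in the driver's handle array, hence slot + 32)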
2183 su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));
2184
2185 // prevent read fault when the image is not actually bound
2186 CmpInstruction *pred =
2187 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2188 TYPE_U32, bld.mkImm(0),
2189 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
2190 if (su->op != OP_SUSTP && su->tex.format) {
2191 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2192 int blockwidth = format->bits[0] + format->bits[1] +
2193 format->bits[2] + format->bits[3];
2194
2195 assert(format->components != 0);
2196 // make sure that the format doesn't mismatch when it's not FMT_NONE
2197 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2198 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2199 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
2200 pred->getDef(0));
2201 }
2202 su->setPredicate(CC_NOT_P, pred->getDef(0));
2203 }
2204
2205 void
2206 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2207 {
2208 processSurfaceCoordsGM107(su);
2209
2210 if (su->op == OP_SULDP)
2211 convertSurfaceFormat(su);
2212
2213 if (su->op == OP_SUREDP) {
2214 Value *def = su->getDef(0);
2215
2216 su->op = OP_SUREDB;
2217 su->setDef(0, bld.getSSA());
2218
2219 bld.setPosition(su, true);
2220
2221 // make sure to initialize dst value when the atomic operation is not
2222 // performed
2223 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2224
2225 assert(su->cc == CC_NOT_P);
2226 mov->setPredicate(CC_P, su->getPredicate());
2227
2228 bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
2229 }
2230 }
2231
2232 bool
2233 NVC0LoweringPass::handleWRSV(Instruction *i)
2234 {
2235 Instruction *st;
2236 Symbol *sym;
2237 uint32_t addr;
2238
2239 // must replace, $sregs are not writable
2240 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2241 if (addr >= 0x400)
2242 return false;
2243 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2244
2245 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2246 i->getSrc(1));
2247 st->perPatch = i->perPatch;
2248
2249 bld.getBB()->remove(i);
2250 return true;
2251 }
2252
2253 void
2254 NVC0LoweringPass::handleLDST(Instruction *i)
2255 {
2256 if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2257 if (prog->getType() == Program::TYPE_COMPUTE) {
2258 i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2259 i->getSrc(0)->reg.fileIndex = 0;
2260 } else
2261 if (prog->getType() == Program::TYPE_GEOMETRY &&
2262 i->src(0).isIndirect(0)) {
2263 // XXX: this assumes vec4 units
2264 Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2265 i->getIndirect(0, 0), bld.mkImm(4));
2266 i->setIndirect(0, 0, ptr);
2267 i->op = OP_VFETCH;
2268 } else {
2269 i->op = OP_VFETCH;
2270 assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2271 }
2272 } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2273 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2274 prog->getType() == Program::TYPE_COMPUTE) {
2275 // The launch descriptor only allows 8 CBs to be set up, but OpenGL
2276 // requires at least 12 UBOs. To get around this limitation, we store the
2277 // addresses in the driver constbuf and load directly from global
2278 // memory.
2279 int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2280 Value *ind = i->getIndirect(0, 1);
2281
2282 if (!ind && fileIndex == -1)
2283 return;
2284
2285 if (ind) {
2286 // Clamp the UBO index when an indirect access is used to avoid
2287 // loading information from the wrong place in the driver cb.
2288 // TODO - synchronize the max with the driver.
2289 ind = bld.mkOp2v(OP_MIN, TYPE_U32, ind,
2290 bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2291 ind, bld.loadImm(NULL, fileIndex)),
2292 bld.loadImm(NULL, 13));
2293 fileIndex = 0;
2294 }
2295
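// Robustness check: compare (constbuf offset + access size) against the
// UBO length and force the loaded value to 0 when the access is out of
// bounds (predicated mov + OP_UNION below).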
2296 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2297 Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2298 Value *length = loadUboLength32(ind, fileIndex * 16);
2299 Value *pred = new_LValue(func, FILE_PREDICATE);
2300 if (i->src(0).isIndirect(0)) {
2301 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2302 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2303 }
2304 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2305 i->setIndirect(0, 1, NULL);
2306 i->setIndirect(0, 0, ptr);
2307 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2308 i->setPredicate(CC_NOT_P, pred);
2309 Value *zero, *dst = i->getDef(0);
2310 i->setDef(0, bld.getSSA());
2311
2312 bld.setPosition(i, true);
2313 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2314 ->setPredicate(CC_P, pred);
2315 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2316 } else if (i->src(0).isIndirect(1)) {
2317 Value *ptr;
2318 if (i->src(0).isIndirect(0))
2319 ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2320 i->getIndirect(0, 1), bld.mkImm(0x1010),
2321 i->getIndirect(0, 0));
2322 else
2323 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2324 i->getIndirect(0, 1), bld.mkImm(16));
2325 i->setIndirect(0, 1, NULL);
2326 i->setIndirect(0, 0, ptr);
2327 i->subOp = NV50_IR_SUBOP_LDC_IS;
2328 }
2329 } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2330 assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2331 i->op = OP_VFETCH;
2332 } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2333 Value *ind = i->getIndirect(0, 1);
2334 Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2335 // XXX come up with a way not to do this for EVERY little access but
2336 // rather to batch these up somehow. Unfortunately we've lost the
2337 // information about the field width by the time we get here.
2338 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2339 Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2340 Value *pred = new_LValue(func, FILE_PREDICATE);
2341 if (i->src(0).isIndirect(0)) {
2342 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2343 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2344 }
2345 i->setIndirect(0, 1, NULL);
2346 i->setIndirect(0, 0, ptr);
2347 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2348 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2349 i->setPredicate(CC_NOT_P, pred);
2350 if (i->defExists(0)) {
2351 Value *zero, *dst = i->getDef(0);
2352 i->setDef(0, bld.getSSA());
2353
2354 bld.setPosition(i, true);
2355 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2356 ->setPredicate(CC_P, pred);
2357 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2358 }
2359 }
2360 }
2361
2362 void
2363 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
2364 {
2365 Value *laneid = bld.getSSA();
2366 Value *x, *y;
2367
2368 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2369
2370 if (c == 0) {
2371 x = dst;
2372 y = NULL;
2373 } else
2374 if (c == 1) {
2375 x = NULL;
2376 y = dst;
2377 } else {
2378 assert(c == 2);
2379 if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2380 bld.mkMov(dst, bld.loadImm(NULL, 0));
2381 return;
2382 }
2383 x = bld.getSSA();
2384 y = bld.getSSA();
2385 }
2386 if (x)
2387 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2388 if (y)
2389 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2390
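// For the triangle domain the third tess coordinate is 1.0 - u - v.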
2391 if (c == 2) {
2392 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2393 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2394 }
2395 }
2396
2397 bool
2398 NVC0LoweringPass::handleRDSV(Instruction *i)
2399 {
2400 Symbol *sym = i->getSrc(0)->asSym();
2401 const SVSemantic sv = sym->reg.data.sv.sv;
2402 Value *vtx = NULL;
2403 Instruction *ld;
2404 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2405
2406 if (addr >= 0x400) {
2407 // mov $sreg
2408 if (sym->reg.data.sv.index == 3) {
2409 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2410 i->op = OP_MOV;
2411 i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2412 }
2413 if (sv == SV_VERTEX_COUNT) {
2414 bld.setPosition(i, true);
2415 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2416 }
2417 return true;
2418 }
2419
2420 switch (sv) {
2421 case SV_POSITION:
2422 assert(prog->getType() == Program::TYPE_FRAGMENT);
2423 if (i->srcExists(1)) {
2424 // Pass offset through to the interpolation logic
2425 ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2426 i->getDef(0), addr, NULL);
2427 ld->setSrc(1, i->getSrc(1));
2428 } else {
2429 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2430 }
2431 break;
2432 case SV_FACE:
2433 {
2434 Value *face = i->getDef(0);
2435 bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2436 if (i->dType == TYPE_F32) {
2437 bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2438 bld.mkOp1(OP_NEG, TYPE_S32, face, face);
2439 bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
2440 }
2441 }
2442 break;
2443 case SV_TESS_COORD:
2444 assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
2445 readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
2446 break;
2447 case SV_NTID:
2448 case SV_NCTAID:
2449 case SV_GRIDID:
2450 assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
2451 if (sym->reg.data.sv.index == 3) {
2452 i->op = OP_MOV;
2453 i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
2454 return true;
2455 }
2456 // Fallthrough
2457 case SV_WORK_DIM:
2458 addr += prog->driver->prop.cp.gridInfoBase;
2459 bld.mkLoad(TYPE_U32, i->getDef(0),
2460 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2461 TYPE_U32, addr), NULL);
2462 break;
2463 case SV_SAMPLE_INDEX:
2464 // TODO: Properly pass source as an address in the PIX address space
2465 // (which can be of the form [r0+offset]). But this is currently
2466 // unnecessary.
2467 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2468 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2469 break;
2470 case SV_SAMPLE_POS: {
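// Each sample has an 8-byte entry in the aux constbuf at sampleInfoBase;
// the shift by 3 below indexes that table and the component index selects
// one of its two 4-byte values (presumably x and y).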
2471 Value *off = new_LValue(func, FILE_GPR);
2472 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2473 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2474 bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
2475 bld.mkLoad(TYPE_F32,
2476 i->getDef(0),
2477 bld.mkSymbol(
2478 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2479 TYPE_U32, prog->driver->io.sampleInfoBase +
2480 4 * sym->reg.data.sv.index),
2481 off);
2482 break;
2483 }
2484 case SV_SAMPLE_MASK: {
2485 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2486 ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
2487 Instruction *sampleid =
2488 bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2489 sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2490 Value *masked =
2491 bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
2492 bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2493 bld.loadImm(NULL, 1), sampleid->getDef(0)));
2494 if (prog->driver->prop.fp.persampleInvocation) {
2495 bld.mkMov(i->getDef(0), masked);
2496 } else {
2497 bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
2498 bld.mkImm(0))
2499 ->subOp = 1;
2500 }
2501 break;
2502 }
2503 case SV_BASEVERTEX:
2504 case SV_BASEINSTANCE:
2505 case SV_DRAWID:
2506 ld = bld.mkLoad(TYPE_U32, i->getDef(0),
2507 bld.mkSymbol(FILE_MEMORY_CONST,
2508 prog->driver->io.auxCBSlot,
2509 TYPE_U32,
2510 prog->driver->io.drawInfoBase +
2511 4 * (sv - SV_BASEVERTEX)),
2512 NULL);
2513 break;
2514 default:
2515 if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
2516 vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2517 if (prog->getType() == Program::TYPE_FRAGMENT) {
2518 bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
2519 } else {
2520 ld = bld.mkFetch(i->getDef(0), i->dType,
2521 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
2522 ld->perPatch = i->perPatch;
2523 }
2524 break;
2525 }
2526 bld.getBB()->remove(i);
2527 return true;
2528 }
2529
2530 bool
2531 NVC0LoweringPass::handleDIV(Instruction *i)
2532 {
2533 if (!isFloatType(i->dType))
2534 return true;
2535 bld.setPosition(i, false);
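// Lower a / b to a * rcp(b); non-float divisions are handled elsewhere.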
2536 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
2537 i->op = OP_MUL;
2538 i->setSrc(1, rcp->getDef(0));
2539 return true;
2540 }
2541
2542 bool
2543 NVC0LoweringPass::handleMOD(Instruction *i)
2544 {
2545 if (!isFloatType(i->dType))
2546 return true;
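// x mod y is computed as x - trunc(x * rcp(y)) * y.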
2547 LValue *value = bld.getScratch(typeSizeof(i->dType));
2548 bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
2549 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
2550 bld.mkOp1(OP_TRUNC, i->dType, value, value);
2551 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
2552 i->op = OP_SUB;
2553 i->setSrc(1, value);
2554 return true;
2555 }
2556
2557 bool
2558 NVC0LoweringPass::handleSQRT(Instruction *i)
2559 {
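// sqrt(x) is computed as x * rsq(x) for f64 (the compare/SELP pair guards
// the x <= 0 case) and as rcp(rsq(x)) for f32.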
2560 if (i->dType == TYPE_F64) {
2561 Value *pred = bld.getSSA(1, FILE_PREDICATE);
2562 Value *zero = bld.loadImm(NULL, 0.0);
2563 Value *dst = bld.getSSA(8);
2564 bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
2565 bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
2566 bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
2567 i->op = OP_MUL;
2568 i->setSrc(1, dst);
2569 // TODO: Handle this properly with a library function
2570 } else {
2571 bld.setPosition(i, true);
2572 i->op = OP_RSQ;
2573 bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
2574 }
2575
2576 return true;
2577 }
2578
2579 bool
2580 NVC0LoweringPass::handlePOW(Instruction *i)
2581 {
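// pow(x, y) = exp2(y * log2(x)); PREEX2 pre-conditions the argument for EX2.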
2582 LValue *val = bld.getScratch();
2583
2584 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
2585 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
2586 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
2587
2588 i->op = OP_EX2;
2589 i->setSrc(0, val);
2590 i->setSrc(1, NULL);
2591
2592 return true;
2593 }
2594
2595 bool
2596 NVC0LoweringPass::handleEXPORT(Instruction *i)
2597 {
2598 if (prog->getType() == Program::TYPE_FRAGMENT) {
2599 int id = i->getSrc(0)->reg.data.offset / 4;
2600
2601 if (i->src(0).isIndirect(0)) // TODO, ugly
2602 return false;
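// Fragment outputs live in fixed GPRs: rewrite the export as a final MOV
// whose destination register id is derived from the output offset.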
2603 i->op = OP_MOV;
2604 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
2605 i->src(0).set(i->src(1));
2606 i->setSrc(1, NULL);
2607 i->setDef(0, new_LValue(func, FILE_GPR));
2608 i->getDef(0)->reg.data.id = id;
2609
2610 prog->maxGPR = MAX2(prog->maxGPR, id);
2611 } else
2612 if (prog->getType() == Program::TYPE_GEOMETRY) {
2613 i->setIndirect(0, 1, gpEmitAddress);
2614 }
2615 return true;
2616 }
2617
2618 bool
2619 NVC0LoweringPass::handleOUT(Instruction *i)
2620 {
2621 Instruction *prev = i->prev;
2622 ImmediateValue stream, prevStream;
2623
2624 // Only merge if the stream ids match. Also, note that the previous
2625 // instruction would have already been lowered, so we take arg1 from it.
2626 if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
2627 i->src(0).getImmediate(stream) &&
2628 prev->src(1).getImmediate(prevStream) &&
2629 stream.reg.data.u32 == prevStream.reg.data.u32) {
2630 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
2631 delete_Instruction(prog, i);
2632 } else {
2633 assert(gpEmitAddress);
2634 i->setDef(0, gpEmitAddress);
2635 i->setSrc(1, i->getSrc(0));
2636 i->setSrc(0, gpEmitAddress);
2637 }
2638 return true;
2639 }
2640
2641 // Generate a binary predicate if an instruction is predicated by
2642 // e.g. an f32 value.
2643 void
2644 NVC0LoweringPass::checkPredicate(Instruction *insn)
2645 {
2646 Value *pred = insn->getPredicate();
2647 Value *pdst;
2648
2649 if (!pred || pred->reg.file == FILE_PREDICATE)
2650 return;
2651 pdst = new_LValue(func, FILE_PREDICATE);
2652
2653 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
2654 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
2655
2656 bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
2657
2658 insn->setPredicate(insn->cc, pdst);
2659 }
2660
2661 //
2662 // - add quadop dance for texturing
2663 // - put FP outputs in GPRs
2664 // - convert instruction sequences
2665 //
2666 bool
2667 NVC0LoweringPass::visit(Instruction *i)
2668 {
2669 bool ret = true;
2670 bld.setPosition(i, false);
2671
2672 if (i->cc != CC_ALWAYS)
2673 checkPredicate(i);
2674
2675 switch (i->op) {
2676 case OP_TEX:
2677 case OP_TXB:
2678 case OP_TXL:
2679 case OP_TXF:
2680 case OP_TXG:
2681 return handleTEX(i->asTex());
2682 case OP_TXD:
2683 return handleTXD(i->asTex());
2684 case OP_TXLQ:
2685 return handleTXLQ(i->asTex());
2686 case OP_TXQ:
2687 return handleTXQ(i->asTex());
2688 case OP_EX2:
2689 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
2690 i->setSrc(0, i->getDef(0));
2691 break;
2692 case OP_POW:
2693 return handlePOW(i);
2694 case OP_DIV:
2695 return handleDIV(i);
2696 case OP_MOD:
2697 return handleMOD(i);
2698 case OP_SQRT:
2699 return handleSQRT(i);
2700 case OP_EXPORT:
2701 ret = handleEXPORT(i);
2702 break;
2703 case OP_EMIT:
2704 case OP_RESTART:
2705 return handleOUT(i);
2706 case OP_RDSV:
2707 return handleRDSV(i);
2708 case OP_WRSV:
2709 return handleWRSV(i);
2710 case OP_STORE:
2711 case OP_LOAD:
2712 handleLDST(i);
2713 break;
2714 case OP_ATOM:
2715 {
2716 const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
2717 handleATOM(i);
2718 handleCasExch(i, cctl);
2719 }
2720 break;
2721 case OP_SULDB:
2722 case OP_SULDP:
2723 case OP_SUSTB:
2724 case OP_SUSTP:
2725 case OP_SUREDB:
2726 case OP_SUREDP:
2727 if (targ->getChipset() >= NVISA_GM107_CHIPSET)
2728 handleSurfaceOpGM107(i->asTex());
2729 else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
2730 handleSurfaceOpNVE4(i->asTex());
2731 else
2732 handleSurfaceOpNVC0(i->asTex());
2733 break;
2734 case OP_SUQ:
2735 handleSUQ(i->asTex());
2736 break;
2737 case OP_BUFQ:
2738 handleBUFQ(i);
2739 break;
2740 default:
2741 break;
2742 }
2743
2744 /* Kepler+ has a special opcode to compute a new base address to be used
2745 * for indirect loads.
2746 *
2747 * Maxwell+ has an additional similar requirement for indirect
2748 * interpolation ops in frag shaders.
2749 */
2750 bool doAfetch = false;
2751 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2752 !i->perPatch &&
2753 (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
2754 i->src(0).isIndirect(0)) {
2755 doAfetch = true;
2756 }
2757 if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
2758 (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
2759 i->src(0).isIndirect(0)) {
2760 doAfetch = true;
2761 }
2762
2763 if (doAfetch) {
2764 Value *addr = cloneShallow(func, i->getSrc(0));
2765 Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
2766 i->getSrc(0));
2767 afetch->setIndirect(0, 0, i->getIndirect(0, 0));
2768 addr->reg.data.offset = 0;
2769 i->setSrc(0, addr);
2770 i->setIndirect(0, 0, afetch->getDef(0));
2771 }
2772
2773 return ret;
2774 }
2775
2776 bool
2777 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
2778 {
2779 if (stage == CG_STAGE_PRE_SSA) {
2780 NVC0LoweringPass pass(prog);
2781 return pass.run(prog, false, true);
2782 } else
2783 if (stage == CG_STAGE_POST_RA) {
2784 NVC0LegalizePostRA pass(prog);
2785 return pass.run(prog, false, true);
2786 } else
2787 if (stage == CG_STAGE_SSA) {
2788 NVC0LegalizeSSA pass;
2789 return pass.run(prog, false, true);
2790 }
2791 return false;
2792 }
2793
2794 } // namespace nv50_ir
2795