• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2011 Christoph Bumiller
3  *           2014 Red Hat Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "codegen/nv50_ir.h"
25 #include "codegen/nv50_ir_build_util.h"
26 
27 #include "codegen/nv50_ir_target_nvc0.h"
28 #include "codegen/nv50_ir_lowering_gm107.h"
29 
30 #include <limits>
31 
32 namespace nv50_ir {
33 
// Per-lane operation selectors for OP_QUADOP. Each selector is a 2-bit value
// that QUADOP() packs into the instruction's subOp.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four per-lane selectors into one 8-bit subOp value. The arguments are
// the selector name suffixes (ADD, SUBR, SUB, MOV2), given in the order
//             UL UR LL LR
// with the first argument ending up in bits 7:6 and the last in bits 1:0.
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))

// Last operand passed to every OP_SHFL emitted in this file.
// NOTE(review): presumably the clamp/segment mask that confines a shuffle to
// one quad of lanes -- confirm against the SHFL encoding.
#define SHFL_BOUND_QUAD 0x1c03
45 
46 void
handlePFETCH(Instruction * i)47 GM107LegalizeSSA::handlePFETCH(Instruction *i)
48 {
49    Value *src0;
50 
51    if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))
52       return;
53 
54    bld.setPosition(i, false);
55    src0 = bld.getSSA();
56 
57    if (i->srcExists(1))
58       bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1));
59    else
60       bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0));
61 
62    i->setSrc(0, src0);
63    i->setSrc(1, NULL);
64 }
65 
66 void
handleLOAD(Instruction * i)67 GM107LegalizeSSA::handleLOAD(Instruction *i)
68 {
69    if (i->src(0).getFile() != FILE_MEMORY_CONST)
70       return;
71    if (i->src(0).isIndirect(0))
72       return;
73    if (typeSizeof(i->dType) != 4)
74       return;
75 
76    i->op = OP_MOV;
77 }
78 
79 void
handleQUADON(Instruction * i)80 GM107LegalizeSSA::handleQUADON(Instruction *i)
81 {
82    i->setDef(0, NULL);
83 }
84 
85 void
handleQUADPOP(Instruction * i)86 GM107LegalizeSSA::handleQUADPOP(Instruction *i)
87 {
88    i->setSrc(0, NULL);
89 }
90 
91 bool
visit(Instruction * i)92 GM107LegalizeSSA::visit(Instruction *i)
93 {
94    switch (i->op) {
95    case OP_QUADON:
96       handleQUADON(i);
97       break;
98    case OP_QUADPOP:
99       handleQUADPOP(i);
100       break;
101    case OP_PFETCH:
102       handlePFETCH(i);
103       break;
104    case OP_LOAD:
105       handleLOAD(i);
106       break;
107    default:
108       break;
109    }
110    return true;
111 }
112 
113 bool
handleManualTXD(TexInstruction * i)114 GM107LoweringPass::handleManualTXD(TexInstruction *i)
115 {
116    // See NVC0LoweringPass::handleManualTXD for rationale. This function
117    // implements the same logic, but using SM50-friendly primitives.
118    static const uint8_t qOps[2] =
119       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };
120    Value *def[4][4];
121    Value *crd[3], *arr, *shadow;
122    Value *tmp;
123    Instruction *tex, *add;
124    Value *quad = bld.mkImm(SHFL_BOUND_QUAD);
125    int l, c;
126    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
127    const int array = i->tex.target.isArray();
128    const int indirect = i->tex.rIndirectSrc >= 0;
129 
130    i->op = OP_TEX; // no need to clone dPdx/dPdy later
131 
132    for (c = 0; c < dim; ++c)
133       crd[c] = bld.getScratch();
134    arr = bld.getScratch();
135    shadow = bld.getScratch();
136    tmp = bld.getScratch();
137 
138    for (l = 0; l < 4; ++l) {
139       Value *bar = bld.getSSA(4, FILE_BARRIER);
140       Value *src[3], *val;
141       Value *lane = bld.mkImm(l);
142       bld.mkOp(OP_QUADON, TYPE_U32, bar);
143       // Make sure lane 0 has the appropriate array/depth compare values
144       if (l != 0) {
145          if (array)
146             bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
147          if (i->tex.target.isShadow())
148             bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim + indirect), lane, quad);
149       }
150 
151       // mov coordinates from lane l to all lanes
152       for (c = 0; c < dim; ++c) {
153          bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);
154       }
155 
156       // add dPdx from lane l to lanes dx
157       for (c = 0; c < dim; ++c) {
158          bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);
159          add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
160          add->subOp = qOps[0];
161          add->lanes = 1; /* abused for .ndv */
162       }
163 
164       // add dPdy from lane l to lanes dy
165       for (c = 0; c < dim; ++c) {
166          bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);
167          add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
168          add->subOp = qOps[1];
169          add->lanes = 1; /* abused for .ndv */
170       }
171 
172       // normalize cube coordinates if necessary
173       if (i->tex.target.isCube()) {
174          for (c = 0; c < 3; ++c)
175             src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
176          val = bld.getScratch();
177          bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
178          bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
179          bld.mkOp1(OP_RCP, TYPE_F32, val, val);
180          for (c = 0; c < 3; ++c)
181             src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
182       } else {
183          for (c = 0; c < dim; ++c)
184             src[c] = crd[c];
185       }
186 
187       // texture
188       bld.insert(tex = cloneForward(func, i));
189       if (l != 0) {
190          if (array)
191             tex->setSrc(0, arr);
192          if (i->tex.target.isShadow())
193             tex->setSrc(array + dim + indirect, shadow);
194       }
195       for (c = 0; c < dim; ++c)
196          tex->setSrc(c + array, src[c]);
197       // broadcast results from lane 0 to all lanes
198       if (l != 0)
199          for (c = 0; i->defExists(c); ++c)
200             bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), bld.mkImm(0), quad);
201       bld.mkOp1(OP_QUADPOP, TYPE_U32, NULL, bar)->fixed = 1;
202 
203       // save results
204       for (c = 0; i->defExists(c); ++c) {
205          Instruction *mov;
206          def[c][l] = bld.getSSA();
207          mov = bld.mkMov(def[c][l], tex->getDef(c));
208          mov->fixed = 1;
209          mov->lanes = 1 << l;
210       }
211    }
212 
213    for (c = 0; i->defExists(c); ++c) {
214       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
215       for (l = 0; l < 4; ++l)
216          u->setSrc(l, def[c][l]);
217    }
218 
219    i->bb->remove(i);
220    return true;
221 }
222 
223 bool
handleDFDX(Instruction * insn)224 GM107LoweringPass::handleDFDX(Instruction *insn)
225 {
226    Instruction *shfl;
227    int qop = 0, xid = 0;
228 
229    switch (insn->op) {
230    case OP_DFDX:
231       qop = QUADOP(SUB, SUBR, SUB, SUBR);
232       xid = 1;
233       break;
234    case OP_DFDY:
235       qop = QUADOP(SUB, SUB, SUBR, SUBR);
236       xid = 2;
237       break;
238    default:
239       assert(!"invalid dfdx opcode");
240       break;
241    }
242 
243    shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0),
244                     bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD));
245    shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
246    insn->op = OP_QUADOP;
247    insn->subOp = qop;
248    insn->lanes = 0; /* abused for !.ndv */
249    insn->setSrc(1, insn->getSrc(0));
250    insn->setSrc(0, shfl->getDef(0));
251    return true;
252 }
253 
254 bool
handlePFETCH(Instruction * i)255 GM107LoweringPass::handlePFETCH(Instruction *i)
256 {
257    Value *tmp0 = bld.getScratch();
258    Value *tmp1 = bld.getScratch();
259    Value *tmp2 = bld.getScratch();
260    bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
261    bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0));
262    bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0));
263    if (i->getSrc(1))
264       bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
265    else
266       bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0));
267    bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);
268    i->setSrc(0, tmp0);
269    i->setSrc(1, NULL);
270    return true;
271 }
272 
273 bool
handlePOPCNT(Instruction * i)274 GM107LoweringPass::handlePOPCNT(Instruction *i)
275 {
276    Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
277                            i->getSrc(0), i->getSrc(1));
278    i->setSrc(0, tmp);
279    i->setSrc(1, NULL);
280    return true;
281 }
282 
283 bool
handleSUQ(TexInstruction * suq)284 GM107LoweringPass::handleSUQ(TexInstruction *suq)
285 {
286    Value *ind = suq->getIndirectR();
287    Value *handle;
288    const int slot = suq->tex.r;
289    const int mask = suq->tex.mask;
290 
291    if (suq->tex.bindless)
292       handle = ind;
293    else
294       handle = loadTexHandle(ind, slot + 32);
295 
296    suq->tex.r = 0xff;
297    suq->tex.s = 0x1f;
298 
299    suq->setIndirectR(NULL);
300    suq->setSrc(0, handle);
301    suq->tex.rIndirectSrc = 0;
302    suq->setSrc(1, bld.loadImm(NULL, 0));
303    suq->tex.query = TXQ_DIMS;
304    suq->op = OP_TXQ;
305 
306    // We store CUBE / CUBE_ARRAY as a 2D ARRAY. Make sure that depth gets
307    // divided by 6.
308    if (mask & 0x4 && suq->tex.target.isCube()) {
309       int d = util_bitcount(mask & 0x3);
310       bld.setPosition(suq, true);
311       bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d), suq->getDef(d),
312                 bld.loadImm(NULL, 6));
313    }
314 
315    // Samples come from a different query. If we want both samples and dims,
316    // create a second suq.
317    if (mask & 0x8) {
318       int d = util_bitcount(mask & 0x7);
319       Value *dst = suq->getDef(d);
320       TexInstruction *samples = suq;
321       assert(dst);
322 
323       if (mask != 0x8) {
324          suq->setDef(d, NULL);
325          suq->tex.mask &= 0x7;
326          samples = cloneShallow(func, suq);
327          for (int i = 0; i < d; i++)
328             samples->setDef(d, NULL);
329          samples->setDef(0, dst);
330          suq->bb->insertAfter(suq, samples);
331       }
332       samples->tex.mask = 0x4;
333       samples->tex.query = TXQ_TYPE;
334    }
335 
336    if (suq->tex.target.isMS()) {
337       bld.setPosition(suq, true);
338 
339       if (mask & 0x1)
340          bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
341                    loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));
342       if (mask & 0x2) {
343          int d = util_bitcount(mask & 0x1);
344          bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
345                    loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));
346       }
347    }
348 
349    return true;
350 }
351 
352 //
353 // - add quadop dance for texturing
354 // - put FP outputs in GPRs
355 // - convert instruction sequences
356 //
357 bool
visit(Instruction * i)358 GM107LoweringPass::visit(Instruction *i)
359 {
360    bld.setPosition(i, false);
361 
362    if (i->cc != CC_ALWAYS)
363       checkPredicate(i);
364 
365    switch (i->op) {
366    case OP_PFETCH:
367       return handlePFETCH(i);
368    case OP_DFDX:
369    case OP_DFDY:
370       return handleDFDX(i);
371    case OP_POPCNT:
372       return handlePOPCNT(i);
373    case OP_SUQ:
374       return handleSUQ(i->asTex());
375    default:
376       return NVC0LoweringPass::visit(i);
377    }
378 }
379 
380 } // namespace nv50_ir
381