1 /*
2 * Copyright 2011 Christoph Bumiller
3 * 2014 Red Hat Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "codegen/nv50_ir.h"
25 #include "codegen/nv50_ir_build_util.h"
26
27 #include "codegen/nv50_ir_target_nvc0.h"
28 #include "codegen/nv50_ir_lowering_gm107.h"
29
30 #include <limits>
31
32 namespace nv50_ir {
33
// Per-lane operations for OP_QUADOP, two bits each; packed by QUADOP() below.
#define QOP_ADD 0
#define QOP_SUBR 1
#define QOP_SUB 2
#define QOP_MOV2 3

// UL UR LL LR
// Build an OP_QUADOP subop value: one QOP_* per quad lane in the order
// upper-left, upper-right, lower-left, lower-right.
#define QUADOP(q, r, s, t) \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))

// Immediate used as the bound operand of OP_SHFL; the name indicates it
// restricts the shuffle to the 4-lane quad (0x1c03 -- TODO confirm against
// the SHFL encoding docs).
#define SHFL_BOUND_QUAD 0x1c03
45
46 void
handlePFETCH(Instruction * i)47 GM107LegalizeSSA::handlePFETCH(Instruction *i)
48 {
49 Value *src0;
50
51 if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))
52 return;
53
54 bld.setPosition(i, false);
55 src0 = bld.getSSA();
56
57 if (i->srcExists(1))
58 bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1));
59 else
60 bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0));
61
62 i->setSrc(0, src0);
63 i->setSrc(1, NULL);
64 }
65
66 void
handleLOAD(Instruction * i)67 GM107LegalizeSSA::handleLOAD(Instruction *i)
68 {
69 if (i->src(0).getFile() != FILE_MEMORY_CONST)
70 return;
71 if (i->src(0).isIndirect(0))
72 return;
73 if (typeSizeof(i->dType) != 4)
74 return;
75
76 i->op = OP_MOV;
77 }
78
79 bool
visit(Instruction * i)80 GM107LegalizeSSA::visit(Instruction *i)
81 {
82 switch (i->op) {
83 case OP_PFETCH:
84 handlePFETCH(i);
85 break;
86 case OP_LOAD:
87 handleLOAD(i);
88 break;
89 default:
90 break;
91 }
92 return true;
93 }
94
// Lower TXD (texture fetch with explicit derivatives) into four plain TEX
// ops, one per lane of the quad: SHFL bounded to the quad moves lane l's
// coordinates/derivatives to all lanes, QUADOP applies the derivative
// offsets, and the per-lane results are merged back with OP_UNION.
bool
GM107LoweringPass::handleManualTXD(TexInstruction *i)
{
   // See NVC0LoweringPass::handleManualTXD for rationale. This function
   // implements the same logic, but using SM50-friendly primitives.
   //
   // qOps[0] mixes in dPdx (ADD in the right column UR/LR), qOps[1] mixes
   // in dPdy (ADD in the bottom row LL/LR); lane order is UL UR LL LR.
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };
   Value *def[4][4];             // def[component][lane] partial results
   Value *crd[3], *arr, *shadow;
   Value *tmp;
   Instruction *tex, *add;
   Value *quad = bld.mkImm(SHFL_BOUND_QUAD);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int array = i->tex.target.isArray();
   const int indirect = i->tex.rIndirectSrc >= 0;

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   arr = bld.getScratch();
   shadow = bld.getScratch();
   tmp = bld.getScratch();

   // Emit one quad-wide TEX per lane; inside OP_QUADON/OP_QUADPOP all four
   // lanes execute with lane l's (derivative-adjusted) coordinates.
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      Value *lane = bld.mkImm(l);
      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      // Make sure lane 0 has the appropriate array/depth compare values
      if (l != 0) {
         if (array)
            bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
         if (i->tex.target.isShadow())
            bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim + indirect), lane, quad);
      }

      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);
      }

      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[0];
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c) {
         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[1];
         add->lanes = 1; /* abused for .ndv */
      }

      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }

      // texture
      bld.insert(tex = cloneForward(func, i));
      if (l != 0) {
         if (array)
            tex->setSrc(0, arr);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim + indirect, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), bld.mkImm(0), quad);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      // Fixed per-lane MOVs keep lane l's result alive until the OP_UNION
      // below merges all four lanes.
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   // Join the four per-lane values into the original instruction's defs.
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
203
204 bool
handleDFDX(Instruction * insn)205 GM107LoweringPass::handleDFDX(Instruction *insn)
206 {
207 Instruction *shfl;
208 int qop = 0, xid = 0;
209
210 switch (insn->op) {
211 case OP_DFDX:
212 qop = QUADOP(SUB, SUBR, SUB, SUBR);
213 xid = 1;
214 break;
215 case OP_DFDY:
216 qop = QUADOP(SUB, SUB, SUBR, SUBR);
217 xid = 2;
218 break;
219 default:
220 assert(!"invalid dfdx opcode");
221 break;
222 }
223
224 shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0),
225 bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD));
226 shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
227 insn->op = OP_QUADOP;
228 insn->subOp = qop;
229 insn->lanes = 0; /* abused for !.ndv */
230 insn->setSrc(1, insn->getSrc(0));
231 insn->setSrc(0, shfl->getDef(0));
232 return true;
233 }
234
235 bool
handlePFETCH(Instruction * i)236 GM107LoweringPass::handlePFETCH(Instruction *i)
237 {
238 Value *tmp0 = bld.getScratch();
239 Value *tmp1 = bld.getScratch();
240 Value *tmp2 = bld.getScratch();
241 bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
242 bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0));
243 bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0));
244 if (i->getSrc(1))
245 bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
246 else
247 bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0));
248 bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);
249 i->setSrc(0, tmp0);
250 i->setSrc(1, NULL);
251 return true;
252 }
253
254 bool
handlePOPCNT(Instruction * i)255 GM107LoweringPass::handlePOPCNT(Instruction *i)
256 {
257 Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
258 i->getSrc(0), i->getSrc(1));
259 i->setSrc(0, tmp);
260 i->setSrc(1, NULL);
261 return true;
262 }
263
264 bool
handleSUQ(TexInstruction * suq)265 GM107LoweringPass::handleSUQ(TexInstruction *suq)
266 {
267 Value *ind = suq->getIndirectR();
268 Value *handle;
269 const int slot = suq->tex.r;
270 const int mask = suq->tex.mask;
271
272 if (suq->tex.bindless)
273 handle = ind;
274 else
275 handle = loadTexHandle(ind, slot + 32);
276
277 suq->tex.r = 0xff;
278 suq->tex.s = 0x1f;
279
280 suq->setIndirectR(NULL);
281 suq->setSrc(0, handle);
282 suq->tex.rIndirectSrc = 0;
283 suq->setSrc(1, bld.loadImm(NULL, 0));
284 suq->tex.query = TXQ_DIMS;
285 suq->op = OP_TXQ;
286
287 // We store CUBE / CUBE_ARRAY as a 2D ARRAY. Make sure that depth gets
288 // divided by 6.
289 if (mask & 0x4 && suq->tex.target.isCube()) {
290 int d = util_bitcount(mask & 0x3);
291 bld.setPosition(suq, true);
292 bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d), suq->getDef(d),
293 bld.loadImm(NULL, 6));
294 }
295
296 // Samples come from a different query. If we want both samples and dims,
297 // create a second suq.
298 if (mask & 0x8) {
299 int d = util_bitcount(mask & 0x7);
300 Value *dst = suq->getDef(d);
301 TexInstruction *samples = suq;
302 assert(dst);
303
304 if (mask != 0x8) {
305 suq->setDef(d, NULL);
306 suq->tex.mask &= 0x7;
307 samples = cloneShallow(func, suq);
308 for (int i = 0; i < d; i++)
309 samples->setDef(d, NULL);
310 samples->setDef(0, dst);
311 suq->bb->insertAfter(suq, samples);
312 }
313 samples->tex.mask = 0x4;
314 samples->tex.query = TXQ_TYPE;
315 }
316
317 if (suq->tex.target.isMS()) {
318 bld.setPosition(suq, true);
319
320 if (mask & 0x1)
321 bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
322 loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));
323 if (mask & 0x2) {
324 int d = util_bitcount(mask & 0x1);
325 bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
326 loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));
327 }
328 }
329
330 return true;
331 }
332
333 //
334 // - add quadop dance for texturing
335 // - put FP outputs in GPRs
336 // - convert instruction sequences
337 //
338 bool
visit(Instruction * i)339 GM107LoweringPass::visit(Instruction *i)
340 {
341 bld.setPosition(i, false);
342
343 if (i->cc != CC_ALWAYS)
344 checkPredicate(i);
345
346 switch (i->op) {
347 case OP_PFETCH:
348 return handlePFETCH(i);
349 case OP_DFDX:
350 case OP_DFDY:
351 return handleDFDX(i);
352 case OP_POPCNT:
353 return handlePOPCNT(i);
354 case OP_SUQ:
355 return handleSUQ(i->asTex());
356 default:
357 return NVC0LoweringPass::visit(i);
358 }
359 }
360
361 } // namespace nv50_ir
362