/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50_ir.h"
#include "nv50_ir_target.h"
#include "nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (op == OP_ATOM)
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!getDef(0)->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_ATOM ||
       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
}

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }
   return true;
}

// =============================================================================

class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// For SPLIT / MERGE pairs that operate on the same registers, replace the
// post-merge def with the SPLIT's source.
bool
MergeSplits::visit(BasicBlock *bb)
{
   Instruction *i, *next, *si;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
         continue;
      si = i->getSrc(0)->getInsn();
      if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
         continue;
      i->def(0).replace(si->getSrc(0), false);
      delete_Instruction(prog, i);
   }

   return true;
}

// =============================================================================

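// Fold loads (from c[] space, immediates, attributes/shared memory) directly
// into the instructions that use them, where the target can encode such
// operands, swapping commutative sources first if that enables the fold.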
class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmdLoad(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) ||
       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
      return false;

   // A 0 can be replaced with a register, so it doesn't count as an immediate.
   ImmediateValue val;
   return ld->src(0).getImmediate(val) && !val.isInteger(0);
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}

void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   if (!targ->getOpInfo(insn).commutative) {
      if (insn->op != OP_SET && insn->op != OP_SLCT &&
          insn->op != OP_SUB && insn->op != OP_XMAD)
         return;
      // XMAD is only commutative if both the CBCC and MRG flags are not set.
      if (insn->op == OP_XMAD &&
          (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC)
         return;
      if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
         return;
   }
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   } else
   if (insn->op == OP_XMAD) {
      // swap h1 flags
      uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
                    (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1));
      insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1;
   }
}

bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (ld->op == OP_LOAD && ld->subOp == NV50_IR_SUBOP_LOAD_LOCKED)
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}

// =============================================================================

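// Fold constant address arithmetic (ADD/SUB of an immediate, a MOV of an
// immediate, or SHLADD) into the constant offset of an indirectly addressed
// source, provided the target can encode the resulting offset.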
class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   BuildUtil bld;
};

bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      bld.setPosition(i, false);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SHLADD) {
            if (!insn->src(2).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, bld.mkOp2v(
               OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}

// =============================================================================

// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   ConstantFolding() : foldCount(0) {}
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   /* true if i was deleted */
   bool opnd(Instruction *i, ImmediateValue&, int s);
   void opnd3(Instruction *, ImmediateValue&);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   CmpInstruction *findOriginForTestWithZero(Value *);

   bool createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      if (!run(prog))
         return false;
   } while (foldCount && ++iterCount < 2);
   return true;
}

bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2)) {
         expr(i, src0, src1, src2);
      } else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) {
         expr(i, src0, src1);
      } else
      if (i->srcExists(0) && i->src(0).getImmediate(src0)) {
         if (opnd(i, src0, 0))
            continue;
      } else
      if (i->srcExists(1) && i->src(1).getImmediate(src1)) {
         if (opnd(i, src1, 1))
            continue;
      }
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}

CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();
   if (!insn)
      return NULL;

   if (insn->asCmp() && insn->op != OP_SLCT)
      return insn->asCmp();

   /* Sometimes mov's will sneak in as a result of other folding. This gets
    * cleaned up later.
    */
   if (insn->op == OP_MOV)
      return findOriginForTestWithZero(insn->getSrc(0));

   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
   if (insn->op == OP_AND) {
      int s = 0;
      ImmediateValue imm;
      if (!insn->src(s).getImmediate(imm)) {
         s = 1;
         if (!insn->src(s).getImmediate(imm))
            return NULL;
      }
      if (imm.reg.data.f32 != 1.0f)
         return NULL;
      /* TODO: Come up with a way to handle the condition being inverted */
      if (insn->src(!s).mod != Modifier(0))
         return NULL;
      return findOriginForTestWithZero(insn->getSrc(!s));
   }

   return NULL;
}

void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}

operation
Modifier::getOp() const
{
   switch (bits) {
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   case 0:
      return OP_MOV;
   default:
      return OP_CVT;
   }
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_SGXT: {
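      // Sign-extend the low 'bits' bits of a (the result stays 0 for a
      // width of 0).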
      int bits = b->data.u32;
      if (bits) {
         uint32_t data = a->data.u32 & (0xffffffff >> (32 - bits));
         if (bits < 32 && (data & (1 << (bits - 1))))
            data = data - (1 << bits);
         res.data.u32 = data;
      }
      break;
   }
   case OP_BMSK:
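      // Build a field of b consecutive set bits starting at bit position a.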
      res.data.u32 = ((1 << b->data.u32) - 1) << a->data.u32;
      break;
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         FALLTHROUGH;
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
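      // Extract the 'width'-bit field at 'offset' by shifting it up to the
      // MSB and back down; the S32 path shifts arithmetically, so the field
      // is sign-extended.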
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1;
      src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->dnz = 0;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV;
      if (i->saturate)
         unary(i, *i->getSrc(0)->asImm());
      break;
   }
   i->subOp = 0;
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0,
                      ImmediateValue &imm1,
                      ImmediateValue &imm2)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_LOP3_LUT:
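      // Evaluate the boolean function bitwise: at each bit position, the
      // bits of a, b and c form a 3-bit index into the 8-entry truth table
      // stored in subOp.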
      for (int n = 0; n < 32; n++) {
         uint8_t lut = ((a->data.u32 >> n) & 1) << 2 |
                       ((b->data.u32 >> n) & 1) << 1 |
                       ((c->data.u32 >> n) & 1);
         res.data.u32 |= !!(i->subOp & (1 << lut)) << n;
      }
      break;
   case OP_PERMT:
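      // Default-mode PRMT: each nibble of b picks one byte of the 8-byte
      // {c, a} pair for the corresponding result byte.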
      if (!i->subOp) {
         uint64_t input = (uint64_t)c->data.u32 << 32 | a->data.u32;
         uint16_t permt = b->data.u32;
         for (int n = 0 ; n < 4; n++, permt >>= 4)
            res.data.u32 |= ((input >> ((permt & 0xf) * 8)) & 0xff) << n * 8;
      } else
         return;
      break;
   case OP_INSBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      unsigned bitmask = ((1 << width) - 1) << offset;
      res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
      break;
   }
   case OP_MAD:
   case OP_FMA: {
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
            c->data.f32;
         break;
      case TYPE_F64:
         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
         break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
            break;
         }
         FALLTHROUGH;
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
         break;
      default:
         return;
      }
      break;
   }
   case OP_SHLADD:
      res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
      break;
   default:
      return;
   }

   ++foldCount;
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->src(2).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);
   i->setSrc(2, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = i->dType;
   i->getSrc(0)->reg.size = typeSizeof(i->dType);

   i->op = OP_MOV;
}

void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_SAT: res.data.f32 = SATURATE(imm.reg.data.f32); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}

void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}

void
ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
{
   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
      if (imm2.isInteger(0)) {
         i->op = OP_MUL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   case OP_SHLADD:
      if (imm2.isInteger(0)) {
         i->op = OP_SHL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   default:
      return;
   }
}

bool
ConstantFolding::createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c)
{
   const Target *target = prog->getTarget();
   int64_t absB = llabs(b);

   //a * (2^shl) -> a << shl
   if (b >= 0 && util_is_power_of_two_or_zero64(b)) {
      int shl = util_logbase2_64(b);

      Value *res = c ? bld.getSSA(typeSizeof(ty)) : def;
      bld.mkOp2(OP_SHL, ty, res, a, bld.mkImm(shl));
      if (c)
         bld.mkOp2(OP_ADD, ty, def, res, c);

      return true;
   }

   //a * (2^shl + 1) -> a << shl + a
   //a * -(2^shl + 1) -> -a << shl + a
   //a * (2^shl - 1) -> a << shl - a
   //a * -(2^shl - 1) -> -a << shl - a
   if (typeSizeof(ty) == 4 &&
       (util_is_power_of_two_or_zero64(absB - 1) ||
        util_is_power_of_two_or_zero64(absB + 1)) &&
       target->isOpSupported(OP_SHLADD, TYPE_U32)) {
      bool subA = util_is_power_of_two_or_zero64(absB + 1);
      int shl = subA ? util_logbase2_64(absB + 1) : util_logbase2_64(absB - 1);

      Value *res = c ? bld.getSSA() : def;
      Instruction *insn = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(shl), a);
      if (b < 0)
         insn->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      if (subA)
         insn->src(2).mod = Modifier(NV50_IR_MOD_NEG);

      if (c)
         bld.mkOp2(OP_ADD, TYPE_U32, def, res, c);

      return true;
   }

   if (typeSizeof(ty) == 4 && b >= 0 && b <= 0xffff &&
       target->isOpSupported(OP_XMAD, TYPE_U32)) {
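      // Split the 32-bit multiply into two 16x32 XMADs:
      // tmp = a.lo16 * b + c, then def = ((a.hi16 * b) << 16) + tmp
      // (PSL shifts the product left by 16, H1(0) takes src0's high half).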
      Value *tmp = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(),
                              a, bld.mkImm((uint32_t)b), c ? c : bld.mkImm(0));
      bld.mkOp3(OP_XMAD, TYPE_U32, def, a, bld.mkImm((uint32_t)b), tmp)->subOp =
         NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_H1(0);

      return true;
   }

   return false;
}

bool
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op;
   Instruction *newi = i;
   bool deleted = false;

   switch (i->op) {
   case OP_SPLIT: {
      bld.setPosition(i, false);

      uint8_t size = i->getDef(0)->reg.size;
      uint8_t bitsize = size * 8;
      uint32_t mask = (1ULL << bitsize) - 1;
      assert(bitsize <= 32);

      uint64_t val = imm0.reg.data.u64;
      for (int8_t d = 0; i->defExists(d); ++d) {
         Value *def = i->getDef(d);
         assert(def->reg.size == size);

         newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)),
                          typeOfSize(size));
         val >>= bitsize;
      }
      delete_Instruction(prog, i);
      deleted = true;
      break;
   }
   case OP_MUL:
      if (i->dType == TYPE_F32 && !i->precise)
         tryCollapseChainedMULs(i, s, imm0);

      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
         assert(!isFloatType(i->sType));
         if (imm0.isInteger(1) && i->dType == TYPE_S32) {
            bld.setPosition(i, false);
            // Need to set to the sign value, which is a compare.
            newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
                             TYPE_S32, i->getSrc(t), bld.mkImm(0));
            delete_Instruction(prog, i);
            deleted = true;
         } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
            // The high bits can't be set in this case (either mul by 0 or
            // unsigned by 1)
            i->op = OP_MOV;
            i->subOp = 0;
            i->setSrc(0, new_ImmediateValue(prog, 0u));
            i->src(0).mod = Modifier(0);
            i->setSrc(1, NULL);
         } else if (!imm0.isNegative() && imm0.isPow2()) {
            // Translate into a shift
            imm0.applyLog2();
            i->op = OP_SHR;
            i->subOp = 0;
            imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
            i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
            i->src(1).mod = 0;
         }
      } else
      if (imm0.isInteger(0)) {
         i->dnz = 0;
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->postFactor = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->dnz = 0;
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->dnz = 0;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->dType) && !i->src(t).mod) {
         bld.setPosition(i, false);
         int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
         if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL)) {
            delete_Instruction(prog, i);
            deleted = true;
         }
      } else
      if (i->postFactor && i->sType == TYPE_F32) {
         /* Can't emit a postfactor with an immediate, have to fold it in */
         i->setSrc(s, new_ImmediateValue(
                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
         i->postFactor = 0;
      }
      break;
   case OP_FMA:
   case OP_MAD:
      if (imm0.isInteger(0)) {
         i->setSrc(0, i->getSrc(2));
         i->src(0).mod = i->src(2).mod;
         i->setSrc(1, NULL);
         i->setSrc(2, NULL);
         i->dnz = 0;
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
      } else
      if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
          (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, i->getSrc(2));
         i->src(1).mod = i->src(2).mod;
         i->setSrc(2, NULL);
         i->dnz = 0;
         i->op = OP_ADD;
      } else
      if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) {
         bld.setPosition(i, false);
         int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
         if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2))) {
            delete_Instruction(prog, i);
            deleted = true;
         }
      }
      break;
   case OP_SUB:
      if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
          !isFloatType(i->dType))
         break;
      FALLTHROUGH;
   case OP_ADD:
      if (i->usesFlags())
         break;
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            if (i->op == OP_SUB)
               i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
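         // Magic-number division: multiply by the low 32 bits of a 33-bit
         // reciprocal m, then fix up with SUB/SHR/ADD so that no
         // intermediate overflows 32 bits (cf. Granlund & Montgomery,
         // "Division by Invariant Integers using Multiplication").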
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
         deleted = true;
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
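         // Signed variant of the magic-number division above:
         // q = ((mulhi(x, m) + x) >> (l - 1)) + (x < 0 ? 1 : 0),
         // negated at the end when d is negative.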
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);

         delete_Instruction(prog, i);
         deleted = true;
      }
      break;

   case OP_MOD:
      if (s == 1 && imm0.isPow2()) {
         bld.setPosition(i, false);
         if (i->sType == TYPE_U32) {
            i->op = OP_AND;
            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
         } else if (i->sType == TYPE_S32) {
            // Do it on the absolute value of the input, and then restore the
            // sign. The only odd case is MIN_INT, but that should work out
            // as well, since MIN_INT mod any power of 2 is 0.
            //
            // Technically we don't have to do any of this since MOD is
            // undefined with negative arguments in GLSL, but this seems like
            // the nice thing to do.
            Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
            Value *neg, *v1, *v2;
            bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
                      (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
                      TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
            Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
                                    bld.loadImm(NULL, imm0.reg.data.u32 - 1));
            bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
               ->setPredicate(CC_P, neg);
            bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
               ->setPredicate(CC_NOT_P, neg);
            newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);

            delete_Instruction(prog, i);
            deleted = true;
         }
      } else if (s == 1) {
         // In this case, we still want the optimized lowering that we get
         // from having division by an immediate.
         //
         // a % b == a - (a/b) * b
         bld.setPosition(i, false);
         Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
                                 i->getSrc(0), i->getSrc(1));
         newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
                          bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
         // TODO: Check that target supports this. In this case, we know that
         // all backends do.
         newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);

         delete_Instruction(prog, i);
         deleted = true;
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      /* This optimizes the case where the output of a set is being compared
       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
       * can be a lot cleverer in our comparison.
       */
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (imm0.reg.data.u32 != 0 || !si)
         return false;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
      // first.
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      // If there is a negative modifier, we need to undo that, by flipping
      // the comparison to zero.
      if (i->src(t).mod.neg())
         ccZ = reverseCondCode(ccZ);
      // If this is a signed comparison, we expect the input to be a regular
      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
      // is positive, so just flip the sign.
      if (i->sType == TYPE_S32) {
         assert(!isFloatType(si->dType));
         ccZ = reverseCondCode(ccZ);
      }
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
      case CC_GT: break; // bool > 0 -- bool
      case CC_NE: break; // bool != 0 -- bool
      default:
         return false;
      }

      // Update the condition of this SET to be identical to the origin set,
      // but with the updated condition code. The original SET should get
      // DCE'd, ideally.
      i->op = si->op;
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      if (si->srcExists(2))
         i->setSrc(2, si->src(2));
      i->sType = si->sType;
   }
   break;

   case OP_AND:
   {
      Instruction *src = i->getSrc(t)->getInsn();
      ImmediateValue imm1;
      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else if (imm0.reg.data.u32 == ~0U) {
         i->op = i->src(t).mod.getOp();
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
         }
         i->setSrc(1, NULL);
      } else if (src->asCmp()) {
         CmpInstruction *cmp = src->asCmp();
         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
            return false;
         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
            return false;
         if (imm0.reg.data.f32 != 1.0)
            return false;
         if (cmp->dType != TYPE_U32)
            return false;

         cmp->dType = TYPE_F32;
         if (i->src(t).mod != Modifier(0)) {
            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
            i->src(t).mod = Modifier(0);
            cmp->setCond = inverseCondCode(cmp->setCond);
         }
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
                 src->op == OP_SHR &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two_or_zero(imm0.reg.data.u32 + 1)) {
         // low byte = offset, high byte = width
         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
         i->op = OP_EXTBF;
         i->setSrc(0, src->getSrc(0));
         i->setSrc(1, new_ImmediateValue(prog, ext));
      } else if (src->op == OP_SHL &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two_or_zero(~imm0.reg.data.u32 + 1) &&
                 util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
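         // The mask only clears low bits that the preceding SHL already
         // zeroed, so the AND is a no-op.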
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      }
   }
   break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;

      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
         break;
      }
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si)
         break;
      ImmediateValue imm1;
      switch (si->op) {
      case OP_SHL:
         if (si->src(1).getImmediate(imm1)) {
            bld.setPosition(i, false);
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
         }
         break;
      case OP_SHR:
         if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
            bld.setPosition(i, false);
            i->op = OP_AND;
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
         }
         break;
      case OP_MUL:
         int muls;
         if (isFloatType(si->dType))
            return false;
         if (si->subOp)
            return false;
         if (si->src(1).getImmediate(imm1))
            muls = 1;
         else if (si->src(0).getImmediate(imm1))
            muls = 0;
         else
            return false;

         bld.setPosition(i, false);
         i->op = OP_MUL;
         i->subOp = 0;
         i->dType = si->dType;
         i->sType = si->sType;
         i->setSrc(0, si->getSrc(!muls));
         i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         break;
      case OP_SUB:
      case OP_ADD:
         int adds;
         if (isFloatType(si->dType))
            return false;
         if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
            adds = 0;
         else if (si->src(1).getImmediate(imm1))
            adds = 1;
         else
            return false;
         if (si->src(!adds).mod != Modifier(0))
            return false;
         // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))

         // This is more operations, but if one of x, y is an immediate, then
         // we can get a situation where (a) we can use ISCADD, or (b)
         // propagate the add bit into an indirect load.
         bld.setPosition(i, false);
         i->op = si->op;
         i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
                                     bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
                                     si->getSrc(!adds),
                                     bld.mkImm(imm0.reg.data.u32)));
         break;
      default:
         return false;
      }
   }
   break;

   case OP_ABS:
   case OP_NEG:
   case OP_SAT:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   case OP_BFIND: {
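      // BFIND yields the position of the most significant set bit (for S32,
      // of the highest bit that differs from the sign), or -1 if there is
      // none; the SAMT variant yields the shift amount 31 - position.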
1553 int32_t res;
1554 switch (i->dType) {
1555 case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1556 case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1557 default:
1558 return false;
1559 }
1560 if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1561 res = 31 - res;
1562 bld.setPosition(i, false); /* make sure bld is init'ed */
1563 i->setSrc(0, bld.mkImm(res));
1564 i->setSrc(1, NULL);
1565 i->op = OP_MOV;
1566 i->subOp = 0;
1567 break;
1568 }
1569 case OP_BREV: {
1570 uint32_t res = util_bitreverse(imm0.reg.data.u32);
1571 i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1572 i->op = OP_MOV;
1573 break;
1574 }
1575 case OP_POPCNT: {
1576 // Only deal with 1-arg POPCNT here
1577 if (i->srcExists(1))
1578 break;
1579 uint32_t res = util_bitcount(imm0.reg.data.u32);
1580 i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1581 i->setSrc(1, NULL);
1582 i->op = OP_MOV;
1583 break;
1584 }
1585 case OP_CVT: {
1586 Storage res;
1587
1588 // TODO: handle 64-bit values properly
1589 if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1590 return false;
1591
1592 // TODO: handle single byte/word extractions
1593 if (i->subOp)
1594 return false;
1595
1596 bld.setPosition(i, true); /* make sure bld is init'ed */
1597
1598 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1599 case type: \
1600 switch (i->sType) { \
1601 case TYPE_F64: \
1602 res.data.dst = util_iround(i->saturate ? \
1603 CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1604 imm0.reg.data.f64); \
1605 break; \
1606 case TYPE_F32: \
1607 res.data.dst = util_iround(i->saturate ? \
1608 CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1609 imm0.reg.data.f32); \
1610 break; \
1611 case TYPE_S32: \
1612 res.data.dst = i->saturate ? \
1613 CLAMP(imm0.reg.data.s32, imin, imax) : \
1614 imm0.reg.data.s32; \
1615 break; \
1616 case TYPE_U32: \
1617 res.data.dst = i->saturate ? \
1618 CLAMP(imm0.reg.data.u32, umin, umax) : \
1619 imm0.reg.data.u32; \
1620 break; \
1621 case TYPE_S16: \
1622 res.data.dst = i->saturate ? \
1623 CLAMP(imm0.reg.data.s16, imin, imax) : \
1624 imm0.reg.data.s16; \
1625 break; \
1626 case TYPE_U16: \
1627 res.data.dst = i->saturate ? \
1628 CLAMP(imm0.reg.data.u16, umin, umax) : \
1629 imm0.reg.data.u16; \
1630 break; \
1631 default: return false; \
1632 } \
1633 i->setSrc(0, bld.mkImm(res.data.dst)); \
1634 break
1635
1636 switch(i->dType) {
1637 CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1638 CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1639 CASE(TYPE_U32, u32, 0, (float)UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1640 CASE(TYPE_S32, s32, (float)INT32_MIN, (float)INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1641 case TYPE_F32:
1642 switch (i->sType) {
1643 case TYPE_F64:
1644 res.data.f32 = i->saturate ?
1645 SATURATE(imm0.reg.data.f64) :
1646 imm0.reg.data.f64;
1647 break;
1648 case TYPE_F32:
1649 res.data.f32 = i->saturate ?
1650 SATURATE(imm0.reg.data.f32) :
1651 imm0.reg.data.f32;
1652 break;
1653 case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1654 case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1655 case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1656 case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1657 default:
1658 return false;
1659 }
1660 i->setSrc(0, bld.mkImm(res.data.f32));
1661 break;
1662 case TYPE_F64:
1663 switch (i->sType) {
1664 case TYPE_F64:
1665 res.data.f64 = i->saturate ?
1666 SATURATE(imm0.reg.data.f64) :
1667 imm0.reg.data.f64;
1668 break;
1669 case TYPE_F32:
1670 res.data.f64 = i->saturate ?
1671 SATURATE(imm0.reg.data.f32) :
1672 imm0.reg.data.f32;
1673 break;
1674 case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1675 case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1676 case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1677 case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1678 default:
1679 return false;
1680 }
1681 i->setSrc(0, bld.mkImm(res.data.f64));
1682 break;
1683 default:
1684 return false;
1685 }
1686 #undef CASE
1687
1688 i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1689 i->op = OP_MOV;
1690 i->saturate = 0;
1691 i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1692 break;
1693 }
1694 default:
1695 return false;
1696 }
1697
1698 // This can get left behind some of the optimizations which simplify
1699 // saturatable values.
1700 if (newi->op == OP_MOV && newi->saturate) {
1701 ImmediateValue tmp;
1702 newi->saturate = 0;
1703 newi->op = OP_SAT;
1704 if (newi->src(0).getImmediate(tmp))
1705 unary(newi, tmp);
1706 }
1707
1708 if (newi->op != op)
1709 foldCount++;
1710 return deleted;
1711 }
1712
1713 // =============================================================================
1714
1715 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
1716 class ModifierFolding : public Pass
1717 {
1718 private:
1719 virtual bool visit(BasicBlock *);
1720 };
1721
1722 bool
visit(BasicBlock * bb)1723 ModifierFolding::visit(BasicBlock *bb)
1724 {
1725 const Target *target = prog->getTarget();
1726
1727 Instruction *i, *next, *mi;
1728 Modifier mod;
1729
1730 for (i = bb->getEntry(); i; i = next) {
1731 next = i->next;
1732
1733 if (false && i->op == OP_SUB) {
1734 // turn "sub" into "add neg" (do we really want this ?)
1735 i->op = OP_ADD;
1736 i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1737 }
1738
1739 for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1740 mi = i->getSrc(s)->getInsn();
1741 if (!mi ||
1742 mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1743 continue;
1744 if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1745 if ((i->op != OP_ADD &&
1746 i->op != OP_MUL) ||
1747 (mi->op != OP_ABS &&
1748 mi->op != OP_NEG))
1749 continue;
1750 } else
1751 if (i->sType != mi->dType) {
1752 continue;
1753 }
1754 if ((mod = Modifier(mi->op)) == Modifier(0))
1755 continue;
1756 mod *= mi->src(0).mod;
1757
1758 if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1759 // abs neg [abs] = abs
1760 mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1761 } else
1762 if ((i->op == OP_NEG) && mod.neg()) {
1763 assert(s == 0);
1764 // neg as both opcode and modifier on same insn is prohibited
1765 // neg neg abs = abs, neg neg = identity
1766 mod = mod & Modifier(~NV50_IR_MOD_NEG);
1767 i->op = mod.getOp();
1768 mod = mod & Modifier(~NV50_IR_MOD_ABS);
1769 if (mod == Modifier(0))
1770 i->op = OP_MOV;
1771 }
1772
1773 if (target->isModSupported(i, s, mod)) {
1774 i->setSrc(s, mi->getSrc(0));
1775 i->src(s).mod *= mod;
1776 }
1777 }
1778
1779 if (i->op == OP_SAT) {
1780 mi = i->getSrc(0)->getInsn();
1781 if (mi &&
1782 mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
1783 mi->saturate = 1;
1784 mi->setDef(0, i->getDef(0));
1785 delete_Instruction(prog, i);
1786 }
1787 }
1788 }
1789
1790 return true;
1791 }
1792
1793 // =============================================================================
1794
1795 // MUL + ADD -> MAD/FMA
1796 // MIN/MAX(a, a) -> a, etc.
1797 // SLCT(a, b, const) -> cc(const) ? a : b
1798 // RCP(RCP(a)) -> a
1799 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1800 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
1801 class AlgebraicOpt : public Pass
1802 {
1803 private:
1804 virtual bool visit(BasicBlock *);
1805
1806 void handleABS(Instruction *);
1807 bool handleADD(Instruction *);
1808 bool tryADDToMADOrSAD(Instruction *, operation toOp);
1809 void handleMINMAX(Instruction *);
1810 void handleRCP(Instruction *);
1811 void handleSLCT(Instruction *);
1812 void handleLOGOP(Instruction *);
1813 void handleCVT_NEG(Instruction *);
1814 void handleCVT_CVT(Instruction *);
1815 void handleCVT_EXTBF(Instruction *);
1816 void handleSUCLAMP(Instruction *);
1817 void handleNEG(Instruction *);
1818 void handleEXTBF_RDSV(Instruction *);
1819
1820 BuildUtil bld;
1821 };
1822
1823 void
handleABS(Instruction * abs)1824 AlgebraicOpt::handleABS(Instruction *abs)
1825 {
1826 Instruction *sub = abs->getSrc(0)->getInsn();
1827 DataType ty;
1828 if (!sub ||
1829 !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
1830 return;
1831 // hidden conversion ?
1832 ty = intTypeToSigned(sub->dType);
1833 if (abs->dType != abs->sType || ty != abs->sType)
1834 return;
1835
1836 if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
1837 sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
1838 sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
1839 return;
1840
1841 Value *src0 = sub->getSrc(0);
1842 Value *src1 = sub->getSrc(1);
1843
1844 if (sub->op == OP_ADD) {
1845 Instruction *neg = sub->getSrc(1)->getInsn();
1846 if (neg && neg->op != OP_NEG) {
1847 neg = sub->getSrc(0)->getInsn();
1848 src0 = sub->getSrc(1);
1849 }
1850 if (!neg || neg->op != OP_NEG ||
1851 neg->dType != neg->sType || neg->sType != ty)
1852 return;
1853 src1 = neg->getSrc(0);
1854 }
1855
1856 // found ABS(SUB))
1857 abs->moveSources(1, 2); // move sources >=1 up by 2
1858 abs->op = OP_SAD;
1859 abs->setType(sub->dType);
1860 abs->setSrc(0, src0);
1861 abs->setSrc(1, src1);
1862 bld.setPosition(abs, false);
1863 abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
1864 }
1865
1866 bool
handleADD(Instruction * add)1867 AlgebraicOpt::handleADD(Instruction *add)
1868 {
1869 Value *src0 = add->getSrc(0);
1870 Value *src1 = add->getSrc(1);
1871
1872 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1873 return false;
1874
1875 bool changed = false;
1876 // we can't optimize to MAD if the add is precise
1877 if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1878 changed = tryADDToMADOrSAD(add, OP_MAD);
1879 if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1880 changed = tryADDToMADOrSAD(add, OP_SAD);
1881 return changed;
1882 }
1883
1884 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1885 // ADD(MUL(a,b), c) -> MAD(a,b,c)
1886 bool
tryADDToMADOrSAD(Instruction * add,operation toOp)1887 AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
1888 {
1889 Value *src0 = add->getSrc(0);
1890 Value *src1 = add->getSrc(1);
1891 Value *src;
1892 int s;
1893 const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
1894 const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
1895 Modifier mod[4];
1896
1897 if (src0->refCount() == 1 &&
1898 src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
1899 s = 0;
1900 else
1901 if (src1->refCount() == 1 &&
1902 src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
1903 s = 1;
1904 else
1905 return false;
1906
1907 src = add->getSrc(s);
1908
1909 if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
1910 return false;
1911
1912 if (src->getInsn()->saturate || src->getInsn()->postFactor ||
1913 src->getInsn()->dnz || src->getInsn()->precise)
1914 return false;
1915
1916 if (toOp == OP_SAD) {
1917 ImmediateValue imm;
1918 if (!src->getInsn()->src(2).getImmediate(imm))
1919 return false;
1920 if (!imm.isInteger(0))
1921 return false;
1922 }
1923
1924 if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
1925 isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
1926 return false;
1927
1928 mod[0] = add->src(0).mod;
1929 mod[1] = add->src(1).mod;
1930 mod[2] = src->getUniqueInsn()->src(0).mod;
1931 mod[3] = src->getUniqueInsn()->src(1).mod;
1932
1933 if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
1934 return false;
1935
1936 add->op = toOp;
1937 add->subOp = src->getInsn()->subOp; // potentially mul-high
1938 add->dnz = src->getInsn()->dnz;
1939 add->dType = src->getInsn()->dType; // sign matters for imad hi
1940 add->sType = src->getInsn()->sType;
1941
1942 add->setSrc(2, add->src(s ? 0 : 1));
1943
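// fold the modifier of the consumed ADD operand into the first MUL/SAD
// source; modifiers compose under xor, so a doubled NEG cancels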
1944 add->setSrc(0, src->getInsn()->getSrc(0));
1945 add->src(0).mod = mod[2] ^ mod[s];
1946 add->setSrc(1, src->getInsn()->getSrc(1));
1947 add->src(1).mod = mod[3];
1948
1949 return true;
1950 }
1951
1952 void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
1954 {
1955 Value *src0 = minmax->getSrc(0);
1956 Value *src1 = minmax->getSrc(1);
1957
1958 if (src0 != src1 || src0->reg.file != FILE_GPR)
1959 return;
1960 if (minmax->src(0).mod == minmax->src(1).mod) {
1961 if (minmax->def(0).mayReplace(minmax->src(0))) {
1962 minmax->def(0).replace(minmax->src(0), false);
1963 delete_Instruction(prog, minmax);
1964 } else {
1965 minmax->op = OP_CVT;
1966 minmax->setSrc(1, NULL);
1967 }
1968 } else {
1969 // TODO:
1970 // min(x, -x) = -abs(x)
1971 // min(x, -abs(x)) = -abs(x)
1972 // min(x, abs(x)) = x
1973 // max(x, -abs(x)) = x
1974 // max(x, abs(x)) = abs(x)
1975 // max(x, -x) = abs(x)
1976 }
1977 }
1978
1979 // rcp(rcp(a)) = a
1980 // rcp(sqrt(a)) = rsq(a)
1981 void
AlgebraicOpt::handleRCP(Instruction *rcp)
1983 {
1984 Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1985
1986 if (!si)
1987 return;
1988
1989 if (si->op == OP_RCP) {
1990 Modifier mod = rcp->src(0).mod * si->src(0).mod;
1991 rcp->op = mod.getOp();
1992 rcp->setSrc(0, si->getSrc(0));
1993 } else if (si->op == OP_SQRT) {
1994 rcp->op = OP_RSQ;
1995 rcp->setSrc(0, si->getSrc(0));
1996 rcp->src(0).mod = rcp->src(0).mod * si->src(0).mod;
1997 }
1998 }
1999
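// SLCT(a, b, const) -> MOV of the operand selected by cc(const);
// SLCT(a, a, x) -> MOV(a) regardless of x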
2000 void
AlgebraicOpt::handleSLCT(Instruction *slct)
2002 {
2003 if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
2004 if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
2005 slct->setSrc(0, slct->getSrc(1));
2006 } else
2007 if (slct->getSrc(0) != slct->getSrc(1)) {
2008 return;
2009 }
2010 slct->op = OP_MOV;
2011 slct->setSrc(1, NULL);
2012 slct->setSrc(2, NULL);
2013 }
2014
2015 void
AlgebraicOpt::handleLOGOP(Instruction *logop)
2017 {
2018 Value *src0 = logop->getSrc(0);
2019 Value *src1 = logop->getSrc(1);
2020
2021 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2022 return;
2023
2024 if (src0 == src1) {
2025 if ((logop->op == OP_AND || logop->op == OP_OR) &&
2026 logop->def(0).mayReplace(logop->src(0))) {
2027 logop->def(0).replace(logop->src(0), false);
2028 delete_Instruction(prog, logop);
2029 }
2030 } else {
2031 // try AND(SET, SET) -> SET_AND(SET)
2032 Instruction *set0 = src0->getInsn();
2033 Instruction *set1 = src1->getInsn();
2034
2035 if (!set0 || set0->fixed || !set1 || set1->fixed)
2036 return;
2037 if (set1->op != OP_SET) {
2038 Instruction *xchg = set0;
2039 set0 = set1;
2040 set1 = xchg;
2041 if (set1->op != OP_SET)
2042 return;
2043 }
2044 operation redOp = (logop->op == OP_AND ? OP_SET_AND :
2045 logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
2046 if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
2047 return;
2048 if (set0->op != OP_SET &&
2049 set0->op != OP_SET_AND &&
2050 set0->op != OP_SET_OR &&
2051 set0->op != OP_SET_XOR)
2052 return;
2053 if (set0->getDef(0)->refCount() > 1 &&
2054 set1->getDef(0)->refCount() > 1)
2055 return;
2056 if (set0->getPredicate() || set1->getPredicate())
2057 return;
2058 // check that they don't source each other
2059 for (int s = 0; s < 2; ++s)
2060 if (set0->getSrc(s) == set1->getDef(0) ||
2061 set1->getSrc(s) == set0->getDef(0))
2062 return;
2063
2064 set0 = cloneForward(func, set0);
2065 set1 = cloneShallow(func, set1);
2066 logop->bb->insertAfter(logop, set1);
2067 logop->bb->insertAfter(logop, set0);
2068
2069 set0->dType = TYPE_U8;
2070 set0->getDef(0)->reg.file = FILE_PREDICATE;
2071 set0->getDef(0)->reg.size = 1;
2072 set1->setSrc(2, set0->getDef(0));
2073 set1->op = redOp;
2074 set1->setDef(0, logop->getDef(0));
2075 delete_Instruction(prog, logop);
2076 }
2077 }
2078
2079 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
2080 // nv50:
2081 // F2I(NEG(I2F(ABS(SET))))
2082 void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
2084 {
2085 Instruction *insn = cvt->getSrc(0)->getInsn();
2086 if (cvt->sType != TYPE_F32 ||
2087 cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
2088 return;
2089 if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
2090 return;
2091 if (insn->src(0).mod != Modifier(0))
2092 return;
2093 insn = insn->getSrc(0)->getInsn();
2094
2095 // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
2096 if (insn && insn->op == OP_CVT &&
2097 insn->dType == TYPE_F32 &&
2098 insn->sType == TYPE_S32) {
2099 insn = insn->getSrc(0)->getInsn();
2100 if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
2101 insn->src(0).mod)
2102 return;
2103 insn = insn->getSrc(0)->getInsn();
2104 if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
2105 return;
2106 } else
2107 if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
2108 return;
2109 }
2110
2111 Instruction *bset = cloneShallow(func, insn);
2112 bset->dType = TYPE_U32;
2113 bset->setDef(0, cvt->getDef(0));
2114 cvt->bb->insertAfter(cvt, bset);
2115 delete_Instruction(prog, cvt);
2116 }
2117
2118 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
2119 // does a type conversion, this becomes trickier as there might be range
2120 // changes/etc. We could handle those in theory as long as the range was being
2121 // reduced or kept the same.
2122 void
AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
2124 {
2125 Instruction *insn = cvt->getSrc(0)->getInsn();
2126
2127 if (!insn ||
2128 insn->saturate ||
2129 insn->subOp ||
2130 insn->dType != insn->sType ||
2131 insn->dType != cvt->sType)
2132 return;
2133
2134 RoundMode rnd = insn->rnd;
2135 switch (insn->op) {
2136 case OP_CEIL:
2137 rnd = ROUND_PI;
2138 break;
2139 case OP_FLOOR:
2140 rnd = ROUND_MI;
2141 break;
2142 case OP_TRUNC:
2143 rnd = ROUND_ZI;
2144 break;
2145 case OP_CVT:
2146 break;
2147 default:
2148 return;
2149 }
2150
2151 if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
2152 rnd = (RoundMode)(rnd & 3);
2153
2154 cvt->rnd = rnd;
2155 cvt->setSrc(0, insn->getSrc(0));
2156 cvt->src(0).mod *= insn->src(0).mod;
2157 cvt->sType = insn->sType;
2158 }
2159
2160 // Some shaders extract packed bytes out of words and convert them to
2161 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
2162 // nv50 for word sizes.
2163 //
2164 // CVT(EXTBF(x, byte/word))
2165 // CVT(AND(bytemask, x))
2166 // CVT(AND(bytemask, SHR(x, 8/16/24)))
2167 // CVT(SHR(x, 16/24))
2168 void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
2170 {
2171 Instruction *insn = cvt->getSrc(0)->getInsn();
2172 ImmediateValue imm;
2173 Value *arg = NULL;
2174 unsigned width, offset = 0;
2175 if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
2176 return;
2177 if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
2178 width = (imm.reg.data.u32 >> 8) & 0xff;
2179 offset = imm.reg.data.u32 & 0xff;
2180 arg = insn->getSrc(0);
2181
2182 if (width != 8 && width != 16)
2183 return;
2184 if (width == 8 && offset & 0x7)
2185 return;
2186 if (width == 16 && offset & 0xf)
2187 return;
2188 } else if (insn->op == OP_AND) {
2189 int s;
2190 if (insn->src(0).getImmediate(imm))
2191 s = 0;
2192 else if (insn->src(1).getImmediate(imm))
2193 s = 1;
2194 else
2195 return;
2196
2197 if (imm.reg.data.u32 == 0xff)
2198 width = 8;
2199 else if (imm.reg.data.u32 == 0xffff)
2200 width = 16;
2201 else
2202 return;
2203
2204 arg = insn->getSrc(!s);
2205 Instruction *shift = arg->getInsn();
2206
2207 if (shift && shift->op == OP_SHR &&
2208 shift->sType == cvt->sType &&
2209 shift->src(1).getImmediate(imm) &&
2210 ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
2211 (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
2212 arg = shift->getSrc(0);
2213 offset = imm.reg.data.u32;
2214 }
2215 // We just AND'd the high bits away, which means this is effectively an
2216 // unsigned value.
2217 cvt->sType = TYPE_U32;
2218 } else if (insn->op == OP_SHR &&
2219 insn->sType == cvt->sType &&
2220 insn->src(1).getImmediate(imm)) {
2221 arg = insn->getSrc(0);
2222 if (imm.reg.data.u32 == 24) {
2223 width = 8;
2224 offset = 24;
2225 } else if (imm.reg.data.u32 == 16) {
2226 width = 16;
2227 offset = 16;
2228 } else {
2229 return;
2230 }
2231 }
2232
2233 if (!arg)
2234 return;
2235
2236 // Irrespective of what came earlier, we can undo a shift on the argument
2237 // by adjusting the offset.
2238 Instruction *shift = arg->getInsn();
2239 if (shift && shift->op == OP_SHL &&
2240 shift->src(1).getImmediate(imm) &&
2241 ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
2242 (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
2243 imm.reg.data.u32 <= offset) {
2244 arg = shift->getSrc(0);
2245 offset -= imm.reg.data.u32;
2246 }
2247
2248 // The unpackSnorm lowering still leaves a few shifts behind, but it's too
2249 // annoying to detect them.
2250
2251 if (width == 8) {
2252 cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
2253 } else {
2254 assert(width == 16);
2255 cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
2256 }
2257 cvt->setSrc(0, arg);
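// CVT's subOp selects the starting byte of the extracted component; the
// bit offset is a multiple of 8 here, enforced by the checks above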
2258 cvt->subOp = offset >> 3;
2259 }
2260
2261 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
2262 void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
2264 {
2265 ImmediateValue imm;
2266 int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
2267 int s;
2268 Instruction *add;
2269
2270 assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
2271
2272 // look for ADD (TODO: only count references by non-SUCLAMP)
2273 if (insn->getSrc(0)->refCount() > 1)
2274 return;
2275 add = insn->getSrc(0)->getInsn();
2276 if (!add || add->op != OP_ADD ||
2277 (add->dType != TYPE_U32 &&
2278 add->dType != TYPE_S32))
2279 return;
2280
2281 // look for immediate
2282 for (s = 0; s < 2; ++s)
2283 if (add->src(s).getImmediate(imm))
2284 break;
2285 if (s >= 2)
2286 return;
2287 s = s ? 0 : 1;
2288 // determine if immediate fits
2289 val += imm.reg.data.s32;
2290 if (val > 31 || val < -32)
2291 return;
2292 // determine if other addend fits
2293 if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
2294 return;
2295
2296 bld.setPosition(insn, false); // make sure bld is init'ed
2297 // replace sources
2298 insn->setSrc(2, bld.mkImm(val));
2299 insn->setSrc(0, add->getSrc(s));
2300 }
2301
2302 // NEG(AND(SET, 1)) -> SET
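// (an integer SET yields 0 / -1; AND with 1 maps that to 0 / 1, and
// negating restores 0 / -1, i.e. the original SET result)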
2303 void
AlgebraicOpt::handleNEG(Instruction *i) {
2305 Instruction *src = i->getSrc(0)->getInsn();
2306 ImmediateValue imm;
2307 int b;
2308
2309 if (isFloatType(i->sType) || !src || src->op != OP_AND)
2310 return;
2311
2312 if (src->src(0).getImmediate(imm))
2313 b = 1;
2314 else if (src->src(1).getImmediate(imm))
2315 b = 0;
2316 else
2317 return;
2318
2319 if (!imm.isInteger(1))
2320 return;
2321
Instruction *set = src->getSrc(b)->getInsn();
if (set && (set->op == OP_SET || set->op == OP_SET_AND ||
2324 set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
2325 !isFloatType(set->dType)) {
2326 i->def(0).replace(set->getDef(0), false);
2327 }
2328 }
2329
2330 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
2331 void
AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
2333 {
2334 Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
if (!rdsv || rdsv->op != OP_RDSV ||
2336 rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
2337 return;
2338 // Avoid creating more RDSV instructions
2339 if (rdsv->getDef(0)->refCount() > 1)
2340 return;
2341
2342 ImmediateValue imm;
2343 if (!i->src(1).getImmediate(imm))
2344 return;
2345
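// The EXTBF src(1) immediate encodes (width << 8) | offset. SV_COMBINED_TID
// packs tid.x in bits 0..15, tid.y in 16..25 and tid.z in 26..31, so each
// accepted immediate selects exactly one component:
//   0x1000 -> width 16, offset  0 -> TID.X
//   0x0a10 -> width 10, offset 16 -> TID.Y
//   0x061a -> width  6, offset 26 -> TID.Z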
2346 int index;
2347 if (imm.isInteger(0x1000))
2348 index = 0;
2349 else
2350 if (imm.isInteger(0x0a10))
2351 index = 1;
2352 else
2353 if (imm.isInteger(0x061a))
2354 index = 2;
2355 else
2356 return;
2357
2358 bld.setPosition(i, false);
2359
2360 i->op = OP_RDSV;
2361 i->setSrc(0, bld.mkSysVal(SV_TID, index));
2362 i->setSrc(1, NULL);
2363 }
2364
2365 bool
AlgebraicOpt::visit(BasicBlock *bb)
2367 {
2368 Instruction *next;
2369 for (Instruction *i = bb->getEntry(); i; i = next) {
2370 next = i->next;
2371 switch (i->op) {
2372 case OP_ABS:
2373 handleABS(i);
2374 break;
2375 case OP_ADD:
2376 handleADD(i);
2377 break;
2378 case OP_RCP:
2379 handleRCP(i);
2380 break;
2381 case OP_MIN:
2382 case OP_MAX:
2383 handleMINMAX(i);
2384 break;
2385 case OP_SLCT:
2386 handleSLCT(i);
2387 break;
2388 case OP_AND:
2389 case OP_OR:
2390 case OP_XOR:
2391 handleLOGOP(i);
2392 break;
2393 case OP_CVT:
2394 handleCVT_NEG(i);
2395 handleCVT_CVT(i);
2396 if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2397 handleCVT_EXTBF(i);
2398 break;
2399 case OP_SUCLAMP:
2400 handleSUCLAMP(i);
2401 break;
2402 case OP_NEG:
2403 handleNEG(i);
2404 break;
2405 case OP_EXTBF:
2406 handleEXTBF_RDSV(i);
2407 break;
2408 default:
2409 break;
2410 }
2411 }
2412
2413 return true;
2414 }
2415
2416 // =============================================================================
2417
2418 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2419 // MUL(a, b) -> a few XMADs
2420 // MAD/FMA(a, b, c) -> a few XMADs
2421 class LateAlgebraicOpt : public Pass
2422 {
2423 private:
2424 virtual bool visit(Instruction *);
2425
2426 void handleADD(Instruction *);
2427 void handleMULMAD(Instruction *);
2428 bool tryADDToSHLADD(Instruction *);
2429
2430 BuildUtil bld;
2431 };
2432
2433 void
LateAlgebraicOpt::handleADD(Instruction *add)
2435 {
2436 Value *src0 = add->getSrc(0);
2437 Value *src1 = add->getSrc(1);
2438
2439 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2440 return;
2441
2442 if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
2443 tryADDToSHLADD(add);
2444 }
2445
2446 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2447 bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
2449 {
2450 Value *src0 = add->getSrc(0);
2451 Value *src1 = add->getSrc(1);
2452 ImmediateValue imm;
2453 Instruction *shl;
2454 Value *src;
2455 int s;
2456
2457 if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
2458 || isFloatType(add->dType))
2459 return false;
2460
2461 if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
2462 s = 0;
2463 else
2464 if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
2465 s = 1;
2466 else
2467 return false;
2468
2469 src = add->getSrc(s);
2470 shl = src->getUniqueInsn();
2471
2472 if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
2473 return false;
2474
2475 if (!shl->src(1).getImmediate(imm))
2476 return false;
2477
2478 add->op = OP_SHLADD;
2479 add->setSrc(2, add->src(!s));
2480 // SHL can't have any modifiers, but the ADD source may have had
2481 // one. Preserve it.
2482 add->setSrc(0, shl->getSrc(0));
2483 if (s == 1)
2484 add->src(0).mod = add->src(1).mod;
2485 add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
2486 add->src(1).mod = Modifier(0);
2487
2488 return true;
2489 }
2490
2491 // MUL(a, b) -> a few XMADs
2492 // MAD/FMA(a, b, c) -> a few XMADs
2493 void
LateAlgebraicOpt::handleMULMAD(Instruction *i)
2495 {
2496 // TODO: handle NV50_IR_SUBOP_MUL_HIGH
2497 if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
2498 return;
2499 if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
2500 return;
2501 if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
2502 return;
2503
2504 assert(!i->src(0).mod);
2505 assert(!i->src(1).mod);
2506 assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
2507
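// A 32-bit multiply decomposes into 16x16 partial products:
//   a * b = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 16)   (mod 2^32)
// The three XMADs built below implement this; the mnemonics in the sketch
// are approximate:
//   XMAD          tmp0, b, a, c      // lo(b)*lo(a) + c
//   XMAD.MRG.H1   tmp1, b, a, 0      // lo(b)*hi(a), merged into the high half
//   XMAD.PSL.CBCC dst,  b, tmp1, tmp0  // high halves; product shifted left 16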
2508 bld.setPosition(i, false);
2509
2510 Value *a = i->getSrc(0);
2511 Value *b = i->getSrc(1);
2512 Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
2513
2514 Value *tmp0 = bld.getSSA();
2515 Value *tmp1 = bld.getSSA();
2516
2517 Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
2518 insn->setPredicate(i->cc, i->getPredicate());
2519
2520 insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
2521 insn->setPredicate(i->cc, i->getPredicate());
2522 insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
2523
2524 Value *pred = i->getPredicate();
2525 i->setPredicate(i->cc, NULL);
2526
2527 i->op = OP_XMAD;
2528 i->setSrc(0, b);
2529 i->setSrc(1, tmp1);
2530 i->setSrc(2, tmp0);
2531 i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
2532 i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
2533
2534 i->setPredicate(i->cc, pred);
2535 }
2536
2537 bool
LateAlgebraicOpt::visit(Instruction *i)
2539 {
2540 switch (i->op) {
2541 case OP_ADD:
2542 handleADD(i);
2543 break;
2544 case OP_MUL:
2545 case OP_MAD:
2546 case OP_FMA:
2547 handleMULMAD(i);
2548 break;
2549 default:
2550 break;
2551 }
2552
2553 return true;
2554 }
2555
2556 // =============================================================================
2557
2558 // Split 64-bit MUL and MAD
2559 class Split64BitOpPreRA : public Pass
2560 {
2561 private:
2562 virtual bool visit(BasicBlock *);
2563 void split64MulMad(Function *, Instruction *, DataType);
2564
2565 BuildUtil bld;
2566 };
2567
2568 bool
Split64BitOpPreRA::visit(BasicBlock *bb)
2570 {
2571 Instruction *i, *next;
2572 Modifier mod;
2573
2574 for (i = bb->getEntry(); i; i = next) {
2575 next = i->next;
2576
2577 DataType hTy;
2578 switch (i->dType) {
2579 case TYPE_U64: hTy = TYPE_U32; break;
2580 case TYPE_S64: hTy = TYPE_S32; break;
2581 default:
2582 continue;
2583 }
2584
2585 if (i->op == OP_MAD || i->op == OP_MUL)
2586 split64MulMad(func, i, hTy);
2587 }
2588
2589 return true;
2590 }
2591
2592 void
Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
2594 {
2595 assert(i->op == OP_MAD || i->op == OP_MUL);
2596 assert(!isFloatType(i->dType) && !isFloatType(i->sType));
2597 assert(typeSizeof(hTy) == 4);
2598
2599 bld.setPosition(i, true);
2600
2601 Value *zero = bld.mkImm(0u);
2602 Value *carry = bld.getSSA(1, FILE_FLAGS);
2603
2604 // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
2605 // values (a, b and c might be 32-bit values), using 32-bit operations. This
2606 // gives the following operations:
2607 // * `d.low = low(a.low * b.low) (+ c.low)?`
2608 // * `d.high = low(a.high * b.low) + low(a.low * b.high)
2609 // + high(a.low * b.low) (+ c.high)?`
2610 //
2611 // To compute the high bits, we can split in the following operations:
2612 // * `tmp1 = low(a.high * b.low) (+ c.high)?`
2613 // * `tmp2 = low(a.low * b.high) + tmp1`
2614 // * `d.high = high(a.low * b.low) + tmp2`
2615 //
// mkSplit puts the lower bits at index 0 and the higher bits at index 1
2617
2618 Value *op1[2];
2619 if (i->getSrc(0)->reg.size == 8)
2620 bld.mkSplit(op1, 4, i->getSrc(0));
2621 else {
2622 op1[0] = i->getSrc(0);
2623 op1[1] = zero;
2624 }
2625 Value *op2[2];
2626 if (i->getSrc(1)->reg.size == 8)
2627 bld.mkSplit(op2, 4, i->getSrc(1));
2628 else {
2629 op2[0] = i->getSrc(1);
2630 op2[1] = zero;
2631 }
2632
2633 Value *op3[2] = { NULL, NULL };
2634 if (i->op == OP_MAD) {
2635 if (i->getSrc(2)->reg.size == 8)
2636 bld.mkSplit(op3, 4, i->getSrc(2));
2637 else {
2638 op3[0] = i->getSrc(2);
2639 op3[1] = zero;
2640 }
2641 }
2642
2643 Value *tmpRes1Hi = bld.getSSA();
2644 if (i->op == OP_MAD)
2645 bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
2646 else
2647 bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
2648
2649 Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);
2650
2651 Value *def[2] = { bld.getSSA(), bld.getSSA() };
2652
2653 // If it was a MAD, add the carry from the low bits
2654 // It is not needed if it was a MUL, since we added high(a.low * b.low) to
2655 // d.high
2656 if (i->op == OP_MAD)
2657 bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
2658 else
2659 bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
2660
2661 Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
2662 hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
2663 if (i->op == OP_MAD)
2664 hiPart3->setFlagsSrc(3, carry);
2665
2666 bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
2667
2668 delete_Instruction(fn->getProgram(), i);
2669 }
2670
2671 // =============================================================================
2672
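// Rewrite the address symbol's offset; clone the symbol first if it is
// shared, so other users keep their original offset.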
2673 static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2675 {
2676 if (offset != ldst->getSrc(0)->reg.data.offset) {
2677 if (ldst->getSrc(0)->refCount() > 1)
2678 ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2679 ldst->getSrc(0)->reg.data.offset = offset;
2680 }
2681 }
2682
2683 // Combine loads and stores, forward stores to loads where possible.
2684 class MemoryOpt : public Pass
2685 {
2686 private:
2687 class Record
2688 {
2689 public:
2690 Record *next;
2691 Instruction *insn;
2692 const Value *rel[2];
2693 const Value *base;
2694 int32_t offset;
2695 int8_t fileIndex;
2696 uint8_t size;
2697 bool locked;
2698 Record *prev;
2699
2700 bool overlaps(const Instruction *ldst) const;
2701
2702 inline void link(Record **);
2703 inline void unlink(Record **);
2704 inline void set(const Instruction *ldst);
2705 };
2706
2707 public:
2708 MemoryOpt();
2709
2710 Record *loads[DATA_FILE_COUNT];
2711 Record *stores[DATA_FILE_COUNT];
2712
2713 MemoryPool recordPool;
2714
2715 private:
2716 virtual bool visit(BasicBlock *);
2717 bool runOpt(BasicBlock *);
2718
2719 Record **getList(const Instruction *);
2720
2721 Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
2722
2723 // merge @insn into load/store instruction from @rec
2724 bool combineLd(Record *rec, Instruction *ld);
2725 bool combineSt(Record *rec, Instruction *st);
2726
2727 bool replaceLdFromLd(Instruction *ld, Record *ldRec);
2728 bool replaceLdFromSt(Instruction *ld, Record *stRec);
2729 bool replaceStFromSt(Instruction *restrict st, Record *stRec);
2730
2731 void addRecord(Instruction *ldst);
2732 void purgeRecords(Instruction *const st, DataFile);
2733 void lockStores(Instruction *const ld);
2734 void reset();
2735
2736 private:
2737 Record *prevRecord;
2738 };
2739
MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2741 {
2742 for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2743 loads[i] = NULL;
2744 stores[i] = NULL;
2745 }
2746 prevRecord = NULL;
2747 }
2748
2749 void
MemoryOpt::reset()
2751 {
2752 for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2753 Record *it, *next;
2754 for (it = loads[i]; it; it = next) {
2755 next = it->next;
2756 recordPool.release(it);
2757 }
2758 loads[i] = NULL;
2759 for (it = stores[i]; it; it = next) {
2760 next = it->next;
2761 recordPool.release(it);
2762 }
2763 stores[i] = NULL;
2764 }
2765 }
2766
2767 bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
2769 {
2770 int32_t offRc = rec->offset;
2771 int32_t offLd = ld->getSrc(0)->reg.data.offset;
2772 int sizeRc = rec->size;
2773 int sizeLd = typeSizeof(ld->dType);
2774 int size = sizeRc + sizeLd;
2775 int d, j;
2776
2777 if (!prog->getTarget()->
2778 isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
2779 return false;
2780 // no unaligned loads
2781 if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
2782 ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
2783 return false;
// for compute, indirect loads are not guaranteed to be aligned
2785 if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
2786 return false;
2787
2788 assert(sizeRc + sizeLd <= 16 && offRc != offLd);
2789
2790 // lock any stores that overlap with the load being merged into the
2791 // existing record.
2792 lockStores(ld);
2793
2794 for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
2795
2796 if (offLd < offRc) {
2797 int sz;
2798 for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
2799 // d: nr of definitions in ld
2800 // j: nr of definitions in rec->insn, move:
2801 for (d = d + j - 1; j > 0; --j, --d)
2802 rec->insn->setDef(d, rec->insn->getDef(j - 1));
2803
2804 if (rec->insn->getSrc(0)->refCount() > 1)
2805 rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
2806 rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
2807
2808 d = 0;
2809 } else {
2810 d = j;
2811 }
2812 // move definitions of @ld to @rec->insn
2813 for (j = 0; sizeLd; ++j, ++d) {
2814 sizeLd -= ld->getDef(j)->reg.size;
2815 rec->insn->setDef(d, ld->getDef(j));
2816 }
2817
2818 rec->size = size;
2819 rec->insn->getSrc(0)->reg.size = size;
2820 rec->insn->setType(typeOfSize(size));
2821
2822 delete_Instruction(prog, ld);
2823
2824 return true;
2825 }
2826
2827 bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
2829 {
2830 int32_t offRc = rec->offset;
2831 int32_t offSt = st->getSrc(0)->reg.data.offset;
2832 int sizeRc = rec->size;
2833 int sizeSt = typeSizeof(st->dType);
2834 int s = sizeSt / 4;
2835 int size = sizeRc + sizeSt;
2836 int j, k;
2837 Value *src[4]; // no modifiers in ValueRef allowed for st
2838 Value *extra[3];
2839
2840 if (!prog->getTarget()->
2841 isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
2842 return false;
2843 // no unaligned stores
2844 if (size == 8 && MIN2(offRc, offSt) & 0x7)
2845 return false;
// for compute, indirect stores are not guaranteed to be aligned
2847 if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
2848 return false;
2849
2850 // There's really no great place to put this in a generic manner. Seemingly
2851 // wide stores at 0x60 don't work in GS shaders on SM50+. Don't combine
2852 // those.
2853 if (prog->getTarget()->getChipset() >= NVISA_GM107_CHIPSET &&
2854 prog->getType() == Program::TYPE_GEOMETRY &&
2855 st->getSrc(0)->reg.file == FILE_SHADER_OUTPUT &&
2856 rec->rel[0] == NULL &&
2857 MIN2(offRc, offSt) == 0x60)
2858 return false;
2859
2860 // remove any existing load/store records for the store being merged into
2861 // the existing record.
2862 purgeRecords(st, DATA_FILE_COUNT);
2863
2864 st->takeExtraSources(0, extra); // save predicate and indirect address
2865
2866 if (offRc < offSt) {
2867 // save values from @st
2868 for (s = 0; sizeSt; ++s) {
2869 sizeSt -= st->getSrc(s + 1)->reg.size;
2870 src[s] = st->getSrc(s + 1);
2871 }
2872 // set record's values as low sources of @st
2873 for (j = 1; sizeRc; ++j) {
2874 sizeRc -= rec->insn->getSrc(j)->reg.size;
2875 st->setSrc(j, rec->insn->getSrc(j));
2876 }
2877 // set saved values as high sources of @st
2878 for (k = j, j = 0; j < s; ++j)
2879 st->setSrc(k++, src[j]);
2880
2881 updateLdStOffset(st, offRc, func);
2882 } else {
2883 for (j = 1; sizeSt; ++j)
2884 sizeSt -= st->getSrc(j)->reg.size;
2885 for (s = 1; sizeRc; ++j, ++s) {
2886 sizeRc -= rec->insn->getSrc(s)->reg.size;
2887 st->setSrc(j, rec->insn->getSrc(s));
2888 }
2889 rec->offset = offSt;
2890 }
2891 st->putExtraSources(0, extra); // restore pointer and predicate
2892
2893 delete_Instruction(prog, rec->insn);
2894 rec->insn = st;
2895 rec->size = size;
2896 rec->insn->getSrc(0)->reg.size = size;
2897 rec->insn->setType(typeOfSize(size));
2898 return true;
2899 }
2900
2901 void
MemoryOpt::Record::set(const Instruction *ldst)
2903 {
2904 const Symbol *mem = ldst->getSrc(0)->asSym();
2905 fileIndex = mem->reg.fileIndex;
2906 rel[0] = ldst->getIndirect(0, 0);
2907 rel[1] = ldst->getIndirect(0, 1);
2908 offset = mem->reg.data.offset;
2909 base = mem->getBase();
2910 size = typeSizeof(ldst->sType);
2911 }
2912
2913 void
MemoryOpt::Record::link(Record **list)
2915 {
2916 next = *list;
2917 if (next)
2918 next->prev = this;
2919 prev = NULL;
2920 *list = this;
2921 }
2922
2923 void
MemoryOpt::Record::unlink(Record **list)
2925 {
2926 if (next)
2927 next->prev = prev;
2928 if (prev)
2929 prev->next = next;
2930 else
2931 *list = next;
2932 }
2933
2934 MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
2936 {
2937 if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2938 return &loads[insn->src(0).getFile()];
2939 return &stores[insn->src(0).getFile()];
2940 }
2941
2942 void
MemoryOpt::addRecord(Instruction *i)
2944 {
2945 Record **list = getList(i);
2946 Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2947
2948 it->link(list);
2949 it->set(i);
2950 it->insn = i;
2951 it->locked = false;
2952 }
2953
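// Look for a previous load/store record of the same 16-byte line, reached
// through the same indirect sources and file index. An overlapping record
// is returned directly; a merely adjacent one is returned with @isAdj set
// so the caller can try to combine the accesses.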
2954 MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
2956 {
2957 const Symbol *sym = insn->getSrc(0)->asSym();
2958 const int size = typeSizeof(insn->sType);
2959 Record *rec = NULL;
2960 Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
2961
2962 for (; it; it = it->next) {
2963 if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
2964 continue;
2965 if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
2966 it->rel[0] != insn->getIndirect(0, 0) ||
2967 it->fileIndex != sym->reg.fileIndex ||
2968 it->rel[1] != insn->getIndirect(0, 1))
2969 continue;
2970
2971 if (it->offset < sym->reg.data.offset) {
2972 if (it->offset + it->size >= sym->reg.data.offset) {
2973 isAdj = (it->offset + it->size == sym->reg.data.offset);
2974 if (!isAdj)
2975 return it;
2976 if (!(it->offset & 0x7))
2977 rec = it;
2978 }
2979 } else {
2980 isAdj = it->offset != sym->reg.data.offset;
2981 if (size <= it->size && !isAdj)
2982 return it;
2983 else
2984 if (!(sym->reg.data.offset & 0x7))
2985 if (it->offset - size <= sym->reg.data.offset)
2986 rec = it;
2987 }
2988 }
2989 return rec;
2990 }
2991
2992 bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
2994 {
2995 Instruction *st = rec->insn;
2996 int32_t offSt = rec->offset;
2997 int32_t offLd = ld->getSrc(0)->reg.data.offset;
2998 int d, s;
2999
3000 for (s = 1; offSt != offLd && st->srcExists(s); ++s)
3001 offSt += st->getSrc(s)->reg.size;
3002 if (offSt != offLd)
3003 return false;
3004
3005 for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
3006 if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
3007 return false;
3008 if (st->getSrc(s)->reg.file != FILE_GPR)
3009 return false;
3010 ld->def(d).replace(st->src(s), false);
3011 }
3012 ld->bb->remove(ld);
3013 return true;
3014 }
3015
3016 bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
3018 {
3019 Instruction *ldR = rec->insn;
3020 int32_t offR = rec->offset;
3021 int32_t offE = ldE->getSrc(0)->reg.data.offset;
3022 int dR, dE;
3023
3024 assert(offR <= offE);
3025 for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
3026 offR += ldR->getDef(dR)->reg.size;
3027 if (offR != offE)
3028 return false;
3029
3030 for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
3031 if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
3032 return false;
3033 ldE->def(dE).replace(ldR->getDef(dR), false);
3034 }
3035
3036 delete_Instruction(prog, ldE);
3037 return true;
3038 }
3039
3040 bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
3042 {
3043 const Instruction *const ri = rec->insn;
3044 Value *extra[3];
3045
3046 int32_t offS = st->getSrc(0)->reg.data.offset;
3047 int32_t offR = rec->offset;
3048 int32_t endS = offS + typeSizeof(st->dType);
3049 int32_t endR = offR + typeSizeof(ri->dType);
3050
3051 rec->size = MAX2(endS, endR) - MIN2(offS, offR);
3052
3053 st->takeExtraSources(0, extra);
3054
3055 if (offR < offS) {
3056 Value *vals[10];
3057 int s, n;
3058 int k = 0;
3059 // get non-replaced sources of ri
3060 for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
3061 vals[k++] = ri->getSrc(s);
3062 n = s;
3063 // get replaced sources of st
3064 for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
3065 vals[k++] = st->getSrc(s);
3066 // skip replaced sources of ri
3067 for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
3068 // get non-replaced sources after values covered by st
3069 for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
3070 vals[k++] = ri->getSrc(s);
3071 assert((unsigned int)k <= ARRAY_SIZE(vals));
3072 for (s = 0; s < k; ++s)
3073 st->setSrc(s + 1, vals[s]);
3074 st->setSrc(0, ri->getSrc(0));
3075 } else
3076 if (endR > endS) {
3077 int j, s;
3078 for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
3079 for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
3080 for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
3081 st->setSrc(s++, ri->getSrc(j));
3082 }
3083 st->putExtraSources(0, extra);
3084
3085 delete_Instruction(prog, rec->insn);
3086
3087 rec->insn = st;
3088 rec->offset = st->getSrc(0)->reg.data.offset;
3089
3090 st->setType(typeOfSize(rec->size));
3091
3092 return true;
3093 }
3094
3095 bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
3097 {
3098 Record that;
3099 that.set(ldst);
3100
3101 // This assumes that images/buffers can't overlap. They can.
3102 // TODO: Plumb the restrict logic through, and only skip when it's a
3103 // restrict situation, or there can implicitly be no writes.
3104 if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
3105 return false;
3106
3107 if (this->rel[0] || that.rel[0])
3108 return this->base == that.base;
3109
3110 return
3111 (this->offset < that.offset + that.size) &&
3112 (this->offset + this->size > that.offset);
3113 }
3114
// Once we see @ld, stores that may affect its result must no longer be
// eliminated or merged with later stores to the same location.
// The stored value can, however, still be used to determine the value
// returned by future loads.
3120 void
MemoryOpt::lockStores(Instruction *const ld)
3122 {
3123 for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
3124 if (!r->locked && r->overlaps(ld))
3125 r->locked = true;
3126 }
3127
3128 // Prior loads from the location of @st are no longer valid.
3129 // Stores to the location of @st may no longer be used to derive
3130 // the value at it nor be coalesced into later stores.
3131 void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
3133 {
3134 if (st)
3135 f = st->src(0).getFile();
3136
3137 for (Record *r = loads[f]; r; r = r->next)
3138 if (!st || r->overlaps(st))
3139 r->unlink(&loads[f]);
3140
3141 for (Record *r = stores[f]; r; r = r->next)
3142 if (!st || r->overlaps(st))
3143 r->unlink(&stores[f]);
3144 }
3145
3146 bool
MemoryOpt::visit(BasicBlock *bb)
3148 {
3149 bool ret = runOpt(bb);
// Run again: a single pass won't combine four 32-bit ld/st into one
// 128-bit ld/st on targets where 96-bit memory operations are forbidden.
3152 if (ret)
3153 ret = runOpt(bb);
3154 return ret;
3155 }
3156
3157 bool
MemoryOpt::runOpt(BasicBlock *bb)
3159 {
3160 Instruction *ldst, *next;
3161 Record *rec;
3162 bool isAdjacent = true;
3163
3164 for (ldst = bb->getEntry(); ldst; ldst = next) {
3165 bool keep = true;
3166 bool isLoad = true;
3167 next = ldst->next;
3168
3169 // TODO: Handle combining sub 4-bytes loads/stores.
3170 if (ldst->op == OP_STORE && typeSizeof(ldst->dType) < 4) {
3171 purgeRecords(ldst, ldst->src(0).getFile());
3172 continue;
3173 }
3174
3175 if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
3176 if (ldst->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
3177 purgeRecords(ldst, ldst->src(0).getFile());
3178 continue;
3179 }
3180 if (ldst->isDead()) {
3181 // might have been produced by earlier optimization
3182 delete_Instruction(prog, ldst);
3183 continue;
3184 }
3185 } else
3186 if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
3187 if (ldst->subOp == NV50_IR_SUBOP_STORE_UNLOCKED) {
3188 purgeRecords(ldst, ldst->src(0).getFile());
3189 continue;
3190 }
3191 if (typeSizeof(ldst->dType) == 4 &&
3192 ldst->src(1).getFile() == FILE_GPR &&
3193 ldst->getSrc(1)->getInsn()->op == OP_NOP) {
3194 delete_Instruction(prog, ldst);
3195 continue;
3196 }
3197 isLoad = false;
3198 } else {
3199 // TODO: maybe have all fixed ops act as barrier ?
3200 if (ldst->op == OP_CALL ||
3201 ldst->op == OP_BAR ||
3202 ldst->op == OP_MEMBAR) {
3203 purgeRecords(NULL, FILE_MEMORY_LOCAL);
3204 purgeRecords(NULL, FILE_MEMORY_GLOBAL);
3205 purgeRecords(NULL, FILE_MEMORY_SHARED);
3206 purgeRecords(NULL, FILE_SHADER_OUTPUT);
3207 } else
3208 if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
3209 if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
3210 purgeRecords(NULL, FILE_MEMORY_LOCAL);
3211 purgeRecords(NULL, FILE_MEMORY_GLOBAL);
3212 purgeRecords(NULL, FILE_MEMORY_SHARED);
3213 } else {
3214 purgeRecords(NULL, ldst->src(0).getFile());
3215 }
3216 } else
3217 if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
3218 purgeRecords(NULL, FILE_SHADER_OUTPUT);
3219 }
3220 continue;
3221 }
3222 if (ldst->getPredicate()) // TODO: handle predicated ld/st
3223 continue;
3224 if (ldst->perPatch) // TODO: create separate per-patch lists
3225 continue;
3226
3227 if (isLoad) {
3228 DataFile file = ldst->src(0).getFile();
3229
3230 // if ld l[]/g[] look for previous store to eliminate the reload
3231 if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
3232 // TODO: shared memory ?
3233 rec = findRecord(ldst, false, isAdjacent);
3234 if (rec && !isAdjacent)
3235 keep = !replaceLdFromSt(ldst, rec);
3236 }
3237
3238 // or look for ld from the same location and replace this one
3239 rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
3240 if (rec) {
3241 if (!isAdjacent)
3242 keep = !replaceLdFromLd(ldst, rec);
3243 else
3244 // or combine a previous load with this one
3245 keep = !combineLd(rec, ldst);
3246 }
3247 if (keep)
3248 lockStores(ldst);
3249 } else {
3250 rec = findRecord(ldst, false, isAdjacent);
3251 if (rec) {
3252 if (!isAdjacent)
3253 keep = !replaceStFromSt(ldst, rec);
3254 else
3255 keep = !combineSt(rec, ldst);
3256 }
3257 if (keep)
3258 purgeRecords(ldst, DATA_FILE_COUNT);
3259 }
3260 if (keep)
3261 addRecord(ldst);
3262 }
3263 reset();
3264
3265 return true;
3266 }
3267
3268 // =============================================================================
3269
3270 // Turn control flow into predicated instructions (after register allocation !).
3271 // TODO:
3272 // Could move this to before register allocation on NVC0 and also handle nested
3273 // constructs.
3274 class FlatteningPass : public Pass
3275 {
3276 public:
FlatteningPass() : gpr_unit(0) {}
3278
3279 private:
3280 virtual bool visit(Function *);
3281 virtual bool visit(BasicBlock *);
3282
3283 bool tryPredicateConditional(BasicBlock *);
3284 void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
3285 void tryPropagateBranch(BasicBlock *);
3286 inline bool isConstantCondition(Value *pred);
3287 inline bool mayPredicate(const Instruction *, const Value *pred) const;
3288 inline void removeFlow(Instruction *);
3289
3290 uint8_t gpr_unit;
3291 };
3292
3293 bool
FlatteningPass::isConstantCondition(Value *pred)
3295 {
3296 Instruction *insn = pred->getUniqueInsn();
3297 assert(insn);
3298 if (insn->op != OP_SET || insn->srcExists(2))
3299 return false;
3300
3301 for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
3302 Instruction *ld = insn->getSrc(s)->getUniqueInsn();
3303 DataFile file;
3304 if (ld) {
3305 if (ld->op != OP_MOV && ld->op != OP_LOAD)
3306 return false;
3307 if (ld->src(0).isIndirect(0))
3308 return false;
3309 file = ld->src(0).getFile();
3310 } else {
3311 file = insn->src(s).getFile();
3312 // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
3313 // in register "units", which can vary between targets.
3314 if (file == FILE_GPR) {
3315 Value *v = insn->getSrc(s);
3316 int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
3317 int units = bytes >> gpr_unit;
3318 if (units > prog->maxGPR)
3319 file = FILE_IMMEDIATE;
3320 }
3321 }
3322 if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
3323 return false;
3324 }
3325 return true;
3326 }
3327
3328 void
FlatteningPass::removeFlow(Instruction *insn)
3330 {
3331 FlowInstruction *term = insn ? insn->asFlow() : NULL;
3332 if (!term)
3333 return;
3334 Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
3335
3336 if (term->op == OP_BRA) {
3337 // TODO: this might get more difficult when we get arbitrary BRAs
3338 if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
3339 return;
3340 } else
3341 if (term->op != OP_JOIN)
3342 return;
3343
3344 Value *pred = term->getPredicate();
3345
3346 delete_Instruction(prog, term);
3347
3348 if (pred && pred->refCount() == 0) {
3349 Instruction *pSet = pred->getUniqueInsn();
3350 pred->join->reg.data.id = -1; // deallocate
3351 if (pSet->isDead())
3352 delete_Instruction(prog, pSet);
3353 }
3354 }
3355
3356 void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
3358 {
3359 for (Instruction *i = bb->getEntry(); i; i = i->next) {
3360 if (i->isNop())
3361 continue;
3362 assert(!i->getPredicate());
3363 i->setPredicate(cc, pred);
3364 }
3365 removeFlow(bb->getExit());
3366 }
3367
3368 bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
3370 {
3371 if (insn->isPseudo())
3372 return true;
3373 // TODO: calls where we don't know which registers are modified
3374
3375 if (!prog->getTarget()->mayPredicate(insn, pred))
3376 return false;
3377 for (int d = 0; insn->defExists(d); ++d)
3378 if (insn->getDef(d)->equals(pred))
3379 return false;
3380 return true;
3381 }
3382
3383 // If we jump to BRA/RET/EXIT, replace the jump with it.
3384 // NOTE: We do not update the CFG anymore here !
3385 //
3386 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
3387 // BB:0
3388 // @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
3389 // BB1:
3390 // bra BB:3
3391 // BB2:
3392 // ...
3393 // BB3:
3394 // ...
3395 void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
3397 {
3398 for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
3399 BasicBlock *bf = i->asFlow()->target.bb;
3400
3401 if (bf->getInsnCount() != 1)
3402 continue;
3403
3404 FlowInstruction *bra = i->asFlow();
3405 FlowInstruction *rep = bf->getExit()->asFlow();
3406
3407 if (!rep || rep->getPredicate())
3408 continue;
3409 if (rep->op != OP_BRA &&
3410 rep->op != OP_JOIN &&
3411 rep->op != OP_EXIT)
3412 continue;
3413
3414 // TODO: If there are multiple branches to @rep, only the first would
3415 // be replaced, so only remove them after this pass is done ?
3416 // Also, need to check all incident blocks for fall-through exits and
3417 // add the branch there.
3418 bra->op = rep->op;
3419 bra->target.bb = rep->target.bb;
3420 if (bf->cfg.incidentCount() == 1)
3421 bf->remove(rep);
3422 }
3423 }
3424
3425 bool
FlatteningPass::visit(Function *fn)
3427 {
3428 gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
3429
3430 return true;
3431 }
3432
3433 bool
FlatteningPass::visit(BasicBlock *bb)
3435 {
3436 if (tryPredicateConditional(bb))
3437 return true;
3438
3439 // try to attach join to previous instruction
3440 if (prog->getTarget()->hasJoin) {
3441 Instruction *insn = bb->getExit();
3442 if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
3443 insn = insn->prev;
3444 if (insn && !insn->getPredicate() &&
3445 !insn->asFlow() &&
3446 insn->op != OP_DISCARD &&
3447 insn->op != OP_TEXBAR &&
3448 !isTextureOp(insn->op) && // probably just nve4
3449 !isSurfaceOp(insn->op) && // not confirmed
3450 insn->op != OP_LINTERP && // probably just nve4
3451 insn->op != OP_PINTERP && // probably just nve4
3452 ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
3453 (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
3454 !insn->isNop()) {
3455 insn->join = 1;
3456 bb->remove(bb->getExit());
3457 return true;
3458 }
3459 }
3460 }
3461
3462 tryPropagateBranch(bb);
3463
3464 return true;
3465 }
3466
3467 bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
3469 {
3470 BasicBlock *bL = NULL, *bR = NULL;
3471 unsigned int nL = 0, nR = 0, limit = 12;
3472 Instruction *insn;
3473 unsigned int mask;
3474
3475 mask = bb->initiatesSimpleConditional();
3476 if (!mask)
3477 return false;
3478
3479 assert(bb->getExit());
3480 Value *pred = bb->getExit()->getPredicate();
3481 assert(pred);
3482
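// if the condition is uniform (loaded from an immediate or constant
// buffer), the branch itself is cheap, so predication only pays off for
// very short blocks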
3483 if (isConstantCondition(pred))
3484 limit = 4;
3485
3486 Graph::EdgeIterator ei = bb->cfg.outgoing();
3487
3488 if (mask & 1) {
3489 bL = BasicBlock::get(ei.getNode());
3490 for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
3491 if (!mayPredicate(insn, pred))
3492 return false;
3493 if (nL > limit)
3494 return false; // too long, do a real branch
3495 }
3496 ei.next();
3497
3498 if (mask & 2) {
3499 bR = BasicBlock::get(ei.getNode());
3500 for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
3501 if (!mayPredicate(insn, pred))
3502 return false;
3503 if (nR > limit)
3504 return false; // too long, do a real branch
3505 }
3506
3507 if (bL)
3508 predicateInstructions(bL, pred, bb->getExit()->cc);
3509 if (bR)
3510 predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
3511
3512 if (bb->joinAt) {
3513 bb->remove(bb->joinAt);
3514 bb->joinAt = NULL;
3515 }
3516 removeFlow(bb->getExit()); // delete the branch/join at the fork point
3517
3518 // remove potential join operations at the end of the conditional
3519 if (prog->getTarget()->joinAnterior) {
3520 bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
3521 if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
3522 removeFlow(bb->getEntry());
3523 }
3524
3525 return true;
3526 }
3527
3528 // =============================================================================
3529
3530 // Fold Immediate into MAD; must be done after register allocation due to
3531 // constraint SDST == SSRC2
3532 // TODO:
3533 // Does NVC0+ have other situations where this pass makes sense?
3534 class PostRaLoadPropagation : public Pass
3535 {
3536 private:
3537 virtual bool visit(Instruction *);
3538
3539 void handleMADforNV50(Instruction *);
3540 void handleMADforNVC0(Instruction *);
3541 };
3542
3543 static bool
post_ra_dead(Instruction *i)
3545 {
3546 for (int d = 0; i->defExists(d); ++d)
3547 if (i->getDef(d)->refCount())
3548 return false;
3549 return true;
3550 }
3551
3552 // Fold Immediate into MAD; must be done after register allocation due to
3553 // constraint SDST == SSRC2
3554 void
PostRaLoadPropagation::handleMADforNV50(Instruction *i)
3556 {
3557 if (i->def(0).getFile() != FILE_GPR ||
3558 i->src(0).getFile() != FILE_GPR ||
3559 i->src(1).getFile() != FILE_GPR ||
3560 i->src(2).getFile() != FILE_GPR ||
3561 i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3562 return;
3563
3564 if (i->getDef(0)->reg.data.id >= 64 ||
3565 i->getSrc(0)->reg.data.id >= 64)
3566 return;
3567
3568 if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
3569 return;
3570
3571 if (i->getPredicate())
3572 return;
3573
3574 Value *vtmp;
3575 Instruction *def = i->getSrc(1)->getInsn();
3576
3577 if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
3578 def = def->getSrc(0)->getInsn();
3579 if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
3580 vtmp = i->getSrc(1);
3581 if (isFloatType(i->sType)) {
3582 i->setSrc(1, def->getSrc(0));
3583 } else {
3584 ImmediateValue val;
3585 // getImmediate() has side-effects on the argument so this *shouldn't*
3586 // be folded into the assert()
3587 ASSERTED bool ret = def->src(0).getImmediate(val);
3588 assert(ret);
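// the 32-bit immediate holds two 16-bit halves; an odd (half-)register
// id presumably addresses the high half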
3589 if (i->getSrc(1)->reg.data.id & 1)
3590 val.reg.data.u32 >>= 16;
3591 val.reg.data.u32 &= 0xffff;
3592 i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
3593 }
3594
3595 /* There's no post-RA dead code elimination, so do it here
3596 * XXX: if we add more code-removing post-RA passes, we might
3597 * want to create a post-RA dead-code elim pass */
3598 if (post_ra_dead(vtmp->getInsn())) {
3599 Value *src = vtmp->getInsn()->getSrc(0);
// Careful -- splits will have already been removed from the
// function. Don't double-delete.
3602 if (vtmp->getInsn()->bb)
3603 delete_Instruction(prog, vtmp->getInsn());
3604 if (src->getInsn() && post_ra_dead(src->getInsn()))
3605 delete_Instruction(prog, src->getInsn());
3606 }
3607 }
3608 }
3609
3610 void
PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
3612 {
3613 if (i->def(0).getFile() != FILE_GPR ||
3614 i->src(0).getFile() != FILE_GPR ||
3615 i->src(1).getFile() != FILE_GPR ||
3616 i->src(2).getFile() != FILE_GPR ||
3617 i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3618 return;
3619
3620 // TODO: gm107 can also do this for S32, maybe other chipsets as well
3621 if (i->dType != TYPE_F32)
3622 return;
3623
3624 if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
3625 return;
3626
3627 ImmediateValue val;
3628 int s;
3629
3630 if (i->src(0).getImmediate(val))
3631 s = 1;
3632 else if (i->src(1).getImmediate(val))
3633 s = 0;
3634 else
3635 return;
3636
3637 if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
3638 return;
3639
3640 if (s == 1)
3641 i->swapSources(0, 1);
3642
3643 Instruction *imm = i->getSrc(1)->getInsn();
3644 i->setSrc(1, imm->getSrc(0));
3645 if (post_ra_dead(imm))
3646 delete_Instruction(prog, imm);
3647 }
3648
3649 bool
PostRaLoadPropagation::visit(Instruction *i)
3651 {
3652 switch (i->op) {
3653 case OP_FMA:
3654 case OP_MAD:
3655 if (prog->getTarget()->getChipset() < 0xc0)
3656 handleMADforNV50(i);
3657 else
3658 handleMADforNVC0(i);
3659 break;
3660 default:
3661 break;
3662 }
3663
3664 return true;
3665 }
3666
3667 // =============================================================================
3668
// Common subexpression elimination. Stupid O(n^2) implementation.
3670 class LocalCSE : public Pass
3671 {
3672 private:
3673 virtual bool visit(BasicBlock *);
3674
3675 inline bool tryReplace(Instruction **, Instruction *);
3676
3677 DLList ops[OP_LAST + 1];
3678 };
3679
3680 class GlobalCSE : public Pass
3681 {
3682 private:
3683 virtual bool visit(BasicBlock *);
3684 };
3685
3686 bool
Instruction::isActionEqual(const Instruction *that) const
3688 {
3689 if (this->op != that->op ||
3690 this->dType != that->dType ||
3691 this->sType != that->sType)
3692 return false;
3693 if (this->cc != that->cc)
3694 return false;
3695
3696 if (this->asTex()) {
3697 if (memcmp(&this->asTex()->tex,
3698 &that->asTex()->tex,
3699 sizeof(this->asTex()->tex)))
3700 return false;
3701 } else
3702 if (this->asCmp()) {
3703 if (this->asCmp()->setCond != that->asCmp()->setCond)
3704 return false;
3705 } else
3706 if (this->asFlow()) {
3707 return false;
3708 } else
3709 if (this->op == OP_PHI && this->bb != that->bb) {
3710 /* TODO: we could probably be a bit smarter here by following the
3711 * control flow, but honestly, it is quite painful to check */
3712 return false;
3713 } else {
3714 if (this->ipa != that->ipa ||
3715 this->lanes != that->lanes ||
3716 this->perPatch != that->perPatch)
3717 return false;
3718 if (this->postFactor != that->postFactor)
3719 return false;
3720 }
3721
3722 if (this->subOp != that->subOp ||
3723 this->saturate != that->saturate ||
3724 this->rnd != that->rnd ||
3725 this->ftz != that->ftz ||
3726 this->dnz != that->dnz ||
3727 this->cache != that->cache ||
3728 this->mask != that->mask)
3729 return false;
3730
3731 return true;
3732 }
3733
3734 bool
Instruction::isResultEqual(const Instruction *that) const
3736 {
3737 unsigned int d, s;
3738
3739 // NOTE: location of discard only affects tex with liveOnly and quadops
3740 if (!this->defExists(0) && this->op != OP_DISCARD)
3741 return false;
3742
3743 if (!isActionEqual(that))
3744 return false;
3745
3746 if (this->predSrc != that->predSrc)
3747 return false;
3748
3749 for (d = 0; this->defExists(d); ++d) {
3750 if (!that->defExists(d) ||
3751 !this->getDef(d)->equals(that->getDef(d), false))
3752 return false;
3753 }
3754 if (that->defExists(d))
3755 return false;
3756
3757 for (s = 0; this->srcExists(s); ++s) {
3758 if (!that->srcExists(s))
3759 return false;
3760 if (this->src(s).mod != that->src(s).mod)
3761 return false;
3762 if (!this->getSrc(s)->equals(that->getSrc(s), true))
3763 return false;
3764 }
3765 if (that->srcExists(s))
3766 return false;
3767
3768 if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
3769 switch (src(0).getFile()) {
3770 case FILE_MEMORY_CONST:
3771 case FILE_SHADER_INPUT:
3772 return true;
3773 case FILE_SHADER_OUTPUT:
3774 return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
3775 default:
3776 return false;
3777 }
3778 }
3779
3780 return true;
3781 }
3782
3783 // pull through common expressions from different in-blocks
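// If every incoming value of a phi is computed by an equivalent single-def
// instruction that has no other uses, hoist one copy into the merge block,
// give it the phi's def and delete the phi; the remaining duplicates become
// dead and are left for DCE.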
3784 bool
GlobalCSE::visit(BasicBlock *bb)
3786 {
3787 Instruction *phi, *next, *ik;
3788 int s;
3789
3790 // TODO: maybe do this with OP_UNION, too
3791
3792 for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
3793 next = phi->next;
3794 if (phi->getSrc(0)->refCount() > 1)
3795 continue;
3796 ik = phi->getSrc(0)->getInsn();
3797 if (!ik)
3798 continue; // probably a function input
3799 if (ik->defCount(0xff) > 1)
3800 continue; // too painful to check if we can really push this forward
3801 for (s = 1; phi->srcExists(s); ++s) {
3802 if (phi->getSrc(s)->refCount() > 1)
3803 break;
3804 if (!phi->getSrc(s)->getInsn() ||
3805 !phi->getSrc(s)->getInsn()->isResultEqual(ik))
3806 break;
3807 }
3808 if (!phi->srcExists(s)) {
3809 assert(ik->op != OP_PHI);
3810 Instruction *entry = bb->getEntry();
3811 ik->bb->remove(ik);
3812 if (!entry || entry->op != OP_JOIN)
3813 bb->insertHead(ik);
3814 else
3815 bb->insertAfter(entry, ik);
3816 ik->setDef(0, phi->getDef(0));
3817 delete_Instruction(prog, phi);
3818 }
3819 }
3820
3821 return true;
3822 }
3823
3824 bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3826 {
3827 Instruction *old = *ptr;
3828
3829 // TODO: maybe relax this later (causes trouble with OP_UNION)
3830 if (i->isPredicated())
3831 return false;
3832
3833 if (!old->isResultEqual(i))
3834 return false;
3835
3836 for (int d = 0; old->defExists(d); ++d)
3837 old->def(d).replace(i->getDef(d), false);
3838 delete_Instruction(prog, old);
3839 *ptr = NULL;
3840 return true;
3841 }
3842
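// Scan each block for pairs of equal instructions. Candidates are found in
// one of two ways: an instruction with an LValue source must share that
// value with any duplicate, so it is enough to walk the use list of its
// least-referenced source; instructions without LValue sources are matched
// against a per-opcode list of the instructions seen so far.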
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getFirst(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   DeadCodeElim() : deadCount(0) {}
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

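// Iterate elimination to a fixed point: removing one dead instruction can
// drop the last use of another value, making its producer dead as well.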
bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
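         // The instruction has side effects but its result is unused: drop
         // the def where the hardware allows it, and demote an atomic
         // exchange whose result is ignored to a plain store.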
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            const Target *targ = prog->getTarget();
            if (targ->getChipset() >= NVISA_GF100_CHIPSET ||
                i->subOp != NV50_IR_SUBOP_ATOM_CAS)
               i->setDef(0, NULL);
            if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
               i->cache = CACHE_CV;
               i->op = OP_STORE;
               i->subOp = 0;
            }
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}

// Each load can write up to 4 destinations, any of which might potentially
// be dead (i.e. a hole). Such a load can always be split into 2 loads,
// independent of where the holes are: we find the first contiguous region of
// live values and put it into the first load, then put the second contiguous
// region into the second load. With at most 4 destinations there can be at
// most 2 contiguous regions.
//
// Note that there are some restrictions: for example, it's not possible to
// do a 64-bit load that isn't 64-bit aligned, so such a load has to be split
// up. The hardware also doesn't support 96-bit loads, so those have to be
// split into a 64-bit and a 32-bit load.
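//
// For example, a 128-bit load into { %r0, %r1, %r2, %r3 } where only %r1 is
// dead becomes a 32-bit load of %r0 at the original offset plus a 64-bit
// load of { %r2, %r3 } at offset +8 (register names purely illustrative).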
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

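   // Clear the mask bit of every def that is completely unused; if all defs
   // remain live there is no hole and nothing to split.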
   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;

   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Scale back the size of the first load until it can be loaded. This
   // typically happens for TYPE_B96 loads.
   while (n1 &&
          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
                                                typeOfSize(size1))) {
      size1 -= def1[--n1]->reg.size;
      d--;
   }

   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else if (!n2) {
         addr2 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Make sure that we've processed all the values
   for (; ld1->defExists(d); ++d)
      assert(!(mask & (1 << d)));

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

// =============================================================================

#define RUN_PASS(l, n, f)                   \
   if (level >= (l)) {                      \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
         INFO("PEEPHOLE: %s\n", #n);        \
      n pass;                               \
      if (!pass.f(this))                    \
         return false;                      \
   }
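
// For reference, RUN_PASS(1, DeadCodeElim, buryAll) expands to roughly
//    if (level >= 1) { DeadCodeElim pass; if (!pass.buryAll(this)) return false; }
// so a pass only runs at or above its optimization level.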

bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(0, Split64BitOpPreRA, run);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(4, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   RUN_PASS(2, PostRaLoadPropagation, run);

   return true;
}

} // namespace nv50_ir