1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for R600
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "R600ISelLowering.h"
16 #include "AMDGPUFrameLowering.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "R600Defines.h"
20 #include "R600InstrInfo.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/IR/Argument.h"
29 #include "llvm/IR/Function.h"
30
31 using namespace llvm;
32
33 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
34 const R600Subtarget &STI)
35 : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
37 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
38 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
39 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
40 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
41 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
42
43 computeRegisterProperties(STI.getRegisterInfo());
44
45 // Legalize loads and stores to the private address space.
46 setOperationAction(ISD::LOAD, MVT::i32, Custom);
47 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
48 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
49
50 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
51 // spaces, so it is custom lowered to handle those where it isn't.
52 for (MVT VT : MVT::integer_valuetypes()) {
53 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
54 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
55 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
56
57 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
58 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
59 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
60
61 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
62 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
63 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
64 }
65
66 // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
67 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
68 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
69 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
70
71 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
72 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
73 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
74
75
76 setOperationAction(ISD::STORE, MVT::i8, Custom);
77 setOperationAction(ISD::STORE, MVT::i32, Custom);
78 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
79 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
80
81 setTruncStoreAction(MVT::i32, MVT::i8, Custom);
82 setTruncStoreAction(MVT::i32, MVT::i16, Custom);
83
84 // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
85 setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
86 setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
87
88 // Set condition code actions
89 setCondCodeAction(ISD::SETO, MVT::f32, Expand);
90 setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
91 setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
92 setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
93 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
94 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
95 setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
96 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
97 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
98 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
99 setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
100 setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
101
102 setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
103 setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
104 setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
105 setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
106
107 setOperationAction(ISD::FCOS, MVT::f32, Custom);
108 setOperationAction(ISD::FSIN, MVT::f32, Custom);
109
110 setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
111 setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
112
113 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
114 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
115 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
116
117 setOperationAction(ISD::FSUB, MVT::f32, Expand);
118
119 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
120 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
121
122 setOperationAction(ISD::SETCC, MVT::i32, Expand);
123 setOperationAction(ISD::SETCC, MVT::f32, Expand);
124 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
125 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
126 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
127
128 setOperationAction(ISD::SELECT, MVT::i32, Expand);
129 setOperationAction(ISD::SELECT, MVT::f32, Expand);
130 setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
131 setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
132
133 // ADD, SUB overflow.
134 // TODO: turn these into Legal?
135 if (Subtarget->hasCARRY())
136 setOperationAction(ISD::UADDO, MVT::i32, Custom);
137
138 if (Subtarget->hasBORROW())
139 setOperationAction(ISD::USUBO, MVT::i32, Custom);
140
141 // Expand sign extension of vectors
142 if (!Subtarget->hasBFE())
143 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
144
145 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
146 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
147
148 if (!Subtarget->hasBFE())
149 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
150 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
151 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
152
153 if (!Subtarget->hasBFE())
154 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
155 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
156 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
157
158 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
159 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
160 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
161
162 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
163
164 setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
165
166 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
167 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
168 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
169 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
170
171 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
172 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
173 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
174 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
175
176 // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
177 // to be Legal/Custom in order to avoid library calls.
178 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
179 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
180 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
181
182 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
183
184 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
185 for (MVT VT : ScalarIntVTs) {
186 setOperationAction(ISD::ADDC, VT, Expand);
187 setOperationAction(ISD::SUBC, VT, Expand);
188 setOperationAction(ISD::ADDE, VT, Expand);
189 setOperationAction(ISD::SUBE, VT, Expand);
190 }
191
192 setSchedulingPreference(Sched::Source);
193
194
195 setTargetDAGCombine(ISD::FP_ROUND);
196 setTargetDAGCombine(ISD::FP_TO_SINT);
197 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
198 setTargetDAGCombine(ISD::SELECT_CC);
199 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
200 }
201
202 const R600Subtarget *R600TargetLowering::getSubtarget() const {
203 return static_cast<const R600Subtarget *>(Subtarget);
204 }
205
206 static inline bool isEOP(MachineBasicBlock::iterator I) {
207 return std::next(I)->getOpcode() == AMDGPU::RETURN;
208 }
209
210 MachineBasicBlock *
211 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
212 MachineBasicBlock *BB) const {
213 MachineFunction * MF = BB->getParent();
214 MachineRegisterInfo &MRI = MF->getRegInfo();
215 MachineBasicBlock::iterator I = MI;
216 const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
217
218 switch (MI.getOpcode()) {
219 default:
220 // Replace LDS_*_RET instructions that don't have any uses with the
221 // equivalent LDS_*_NORET instructions.
222 if (TII->isLDSRetInstr(MI.getOpcode())) {
223 int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
224 assert(DstIdx != -1);
225 MachineInstrBuilder NewMI;
226 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
227 // LDS_1A2D support and remove this special case.
228 if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
229 MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
230 return BB;
231
232 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
233 TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
234 for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
235 NewMI.addOperand(MI.getOperand(i));
236 }
237 } else {
238 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
239 }
240 break;
241 case AMDGPU::CLAMP_R600: {
242 MachineInstr *NewMI = TII->buildDefaultInstruction(
243 *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
244 MI.getOperand(1).getReg());
245 TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
246 break;
247 }
248
249 case AMDGPU::FABS_R600: {
250 MachineInstr *NewMI = TII->buildDefaultInstruction(
251 *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
252 MI.getOperand(1).getReg());
253 TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
254 break;
255 }
256
257 case AMDGPU::FNEG_R600: {
258 MachineInstr *NewMI = TII->buildDefaultInstruction(
259 *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
260 MI.getOperand(1).getReg());
261 TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
262 break;
263 }
264
265 case AMDGPU::MASK_WRITE: {
266 unsigned maskedRegister = MI.getOperand(0).getReg();
267 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
268 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
269 TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
270 break;
271 }
272
273 case AMDGPU::MOV_IMM_F32:
274 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
275 .getFPImm()
276 ->getValueAPF()
277 .bitcastToAPInt()
278 .getZExtValue());
279 break;
280 case AMDGPU::MOV_IMM_I32:
281 TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
282 MI.getOperand(1).getImm());
283 break;
284 case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
285 //TODO: Perhaps combine this instruction with the next if possible
286 auto MIB = TII->buildDefaultInstruction(
287 *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
288 int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
289 //TODO: Ugh this is rather ugly
290 MIB->getOperand(Idx) = MI.getOperand(1);
291 break;
292 }
293 case AMDGPU::CONST_COPY: {
294 MachineInstr *NewMI = TII->buildDefaultInstruction(
295 *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
296 TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
297 MI.getOperand(1).getImm());
298 break;
299 }
300
301 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
302 case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
303 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
304 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
305 .addOperand(MI.getOperand(0))
306 .addOperand(MI.getOperand(1))
307 .addImm(isEOP(I)); // Set End of program bit
308 break;
309 }
310 case AMDGPU::RAT_STORE_TYPED_eg: {
311 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
312 .addOperand(MI.getOperand(0))
313 .addOperand(MI.getOperand(1))
314 .addOperand(MI.getOperand(2))
315 .addImm(isEOP(I)); // Set End of program bit
316 break;
317 }
318
319 case AMDGPU::TXD: {
320 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
321 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
322 MachineOperand &RID = MI.getOperand(4);
323 MachineOperand &SID = MI.getOperand(5);
324 unsigned TextureId = MI.getOperand(6).getImm();
325 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
326 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
327
328 switch (TextureId) {
329 case 5: // Rect
330 CTX = CTY = 0;
331 break;
332 case 6: // Shadow1D
333 SrcW = SrcZ;
334 break;
335 case 7: // Shadow2D
336 SrcW = SrcZ;
337 break;
338 case 8: // ShadowRect
339 CTX = CTY = 0;
340 SrcW = SrcZ;
341 break;
342 case 9: // 1DArray
343 SrcZ = SrcY;
344 CTZ = 0;
345 break;
346 case 10: // 2DArray
347 CTZ = 0;
348 break;
349 case 11: // Shadow1DArray
350 SrcZ = SrcY;
351 CTZ = 0;
352 break;
353 case 12: // Shadow2DArray
354 CTZ = 0;
355 break;
356 }
357 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
358 T0)
359 .addOperand(MI.getOperand(3))
360 .addImm(SrcX)
361 .addImm(SrcY)
362 .addImm(SrcZ)
363 .addImm(SrcW)
364 .addImm(0)
365 .addImm(0)
366 .addImm(0)
367 .addImm(0)
368 .addImm(1)
369 .addImm(2)
370 .addImm(3)
371 .addOperand(RID)
372 .addOperand(SID)
373 .addImm(CTX)
374 .addImm(CTY)
375 .addImm(CTZ)
376 .addImm(CTW);
377 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
378 T1)
379 .addOperand(MI.getOperand(2))
380 .addImm(SrcX)
381 .addImm(SrcY)
382 .addImm(SrcZ)
383 .addImm(SrcW)
384 .addImm(0)
385 .addImm(0)
386 .addImm(0)
387 .addImm(0)
388 .addImm(1)
389 .addImm(2)
390 .addImm(3)
391 .addOperand(RID)
392 .addOperand(SID)
393 .addImm(CTX)
394 .addImm(CTY)
395 .addImm(CTZ)
396 .addImm(CTW);
397 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
398 .addOperand(MI.getOperand(0))
399 .addOperand(MI.getOperand(1))
400 .addImm(SrcX)
401 .addImm(SrcY)
402 .addImm(SrcZ)
403 .addImm(SrcW)
404 .addImm(0)
405 .addImm(0)
406 .addImm(0)
407 .addImm(0)
408 .addImm(1)
409 .addImm(2)
410 .addImm(3)
411 .addOperand(RID)
412 .addOperand(SID)
413 .addImm(CTX)
414 .addImm(CTY)
415 .addImm(CTZ)
416 .addImm(CTW)
417 .addReg(T0, RegState::Implicit)
418 .addReg(T1, RegState::Implicit);
419 break;
420 }
421
422 case AMDGPU::TXD_SHADOW: {
423 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
424 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
425 MachineOperand &RID = MI.getOperand(4);
426 MachineOperand &SID = MI.getOperand(5);
427 unsigned TextureId = MI.getOperand(6).getImm();
428 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
429 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
430
431 switch (TextureId) {
432 case 5: // Rect
433 CTX = CTY = 0;
434 break;
435 case 6: // Shadow1D
436 SrcW = SrcZ;
437 break;
438 case 7: // Shadow2D
439 SrcW = SrcZ;
440 break;
441 case 8: // ShadowRect
442 CTX = CTY = 0;
443 SrcW = SrcZ;
444 break;
445 case 9: // 1DArray
446 SrcZ = SrcY;
447 CTZ = 0;
448 break;
449 case 10: // 2DArray
450 CTZ = 0;
451 break;
452 case 11: // Shadow1DArray
453 SrcZ = SrcY;
454 CTZ = 0;
455 break;
456 case 12: // Shadow2DArray
457 CTZ = 0;
458 break;
459 }
460
461 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
462 T0)
463 .addOperand(MI.getOperand(3))
464 .addImm(SrcX)
465 .addImm(SrcY)
466 .addImm(SrcZ)
467 .addImm(SrcW)
468 .addImm(0)
469 .addImm(0)
470 .addImm(0)
471 .addImm(0)
472 .addImm(1)
473 .addImm(2)
474 .addImm(3)
475 .addOperand(RID)
476 .addOperand(SID)
477 .addImm(CTX)
478 .addImm(CTY)
479 .addImm(CTZ)
480 .addImm(CTW);
481 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
482 T1)
483 .addOperand(MI.getOperand(2))
484 .addImm(SrcX)
485 .addImm(SrcY)
486 .addImm(SrcZ)
487 .addImm(SrcW)
488 .addImm(0)
489 .addImm(0)
490 .addImm(0)
491 .addImm(0)
492 .addImm(1)
493 .addImm(2)
494 .addImm(3)
495 .addOperand(RID)
496 .addOperand(SID)
497 .addImm(CTX)
498 .addImm(CTY)
499 .addImm(CTZ)
500 .addImm(CTW);
501 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
502 .addOperand(MI.getOperand(0))
503 .addOperand(MI.getOperand(1))
504 .addImm(SrcX)
505 .addImm(SrcY)
506 .addImm(SrcZ)
507 .addImm(SrcW)
508 .addImm(0)
509 .addImm(0)
510 .addImm(0)
511 .addImm(0)
512 .addImm(1)
513 .addImm(2)
514 .addImm(3)
515 .addOperand(RID)
516 .addOperand(SID)
517 .addImm(CTX)
518 .addImm(CTY)
519 .addImm(CTZ)
520 .addImm(CTW)
521 .addReg(T0, RegState::Implicit)
522 .addReg(T1, RegState::Implicit);
523 break;
524 }
525
526 case AMDGPU::BRANCH:
527 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
528 .addOperand(MI.getOperand(0));
529 break;
530
531 case AMDGPU::BRANCH_COND_f32: {
532 MachineInstr *NewMI =
533 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
534 AMDGPU::PREDICATE_BIT)
535 .addOperand(MI.getOperand(1))
536 .addImm(OPCODE_IS_NOT_ZERO)
537 .addImm(0); // Flags
538 TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
539 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
540 .addOperand(MI.getOperand(0))
541 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
542 break;
543 }
544
545 case AMDGPU::BRANCH_COND_i32: {
546 MachineInstr *NewMI =
547 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
548 AMDGPU::PREDICATE_BIT)
549 .addOperand(MI.getOperand(1))
550 .addImm(OPCODE_IS_NOT_ZERO_INT)
551 .addImm(0); // Flags
552 TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
553 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
554 .addOperand(MI.getOperand(0))
555 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
556 break;
557 }
558
559 case AMDGPU::EG_ExportSwz:
560 case AMDGPU::R600_ExportSwz: {
561 // Instruction is left unmodified if it's not the last one of its type.
562 bool isLastInstructionOfItsType = true;
563 unsigned InstExportType = MI.getOperand(1).getImm();
564 for (MachineBasicBlock::iterator NextExportInst = std::next(I),
565 EndBlock = BB->end(); NextExportInst != EndBlock;
566 NextExportInst = std::next(NextExportInst)) {
567 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
568 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
569 unsigned CurrentInstExportType = NextExportInst->getOperand(1)
570 .getImm();
571 if (CurrentInstExportType == InstExportType) {
572 isLastInstructionOfItsType = false;
573 break;
574 }
575 }
576 }
577 bool EOP = isEOP(I);
578 if (!EOP && !isLastInstructionOfItsType)
579 return BB;
580 unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
581 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
582 .addOperand(MI.getOperand(0))
583 .addOperand(MI.getOperand(1))
584 .addOperand(MI.getOperand(2))
585 .addOperand(MI.getOperand(3))
586 .addOperand(MI.getOperand(4))
587 .addOperand(MI.getOperand(5))
588 .addOperand(MI.getOperand(6))
589 .addImm(CfInst)
590 .addImm(EOP);
591 break;
592 }
593 case AMDGPU::RETURN: {
594 // RETURN instructions must have the live-out registers as implicit uses,
595 // otherwise they appear dead.
596 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
597 MachineInstrBuilder MIB(*MF, MI);
598 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
599 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
600 return BB;
601 }
602 }
603
604 MI.eraseFromParent();
605 return BB;
606 }
607
608 //===----------------------------------------------------------------------===//
609 // Custom DAG Lowering Operations
610 //===----------------------------------------------------------------------===//
611
612 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
613 MachineFunction &MF = DAG.getMachineFunction();
614 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
615 switch (Op.getOpcode()) {
616 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
617 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
618 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
619 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
620 case ISD::SRA_PARTS:
621 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
622 case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
623 case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
624 case ISD::FCOS:
625 case ISD::FSIN: return LowerTrig(Op, DAG);
626 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
627 case ISD::STORE: return LowerSTORE(Op, DAG);
628 case ISD::LOAD: {
629 SDValue Result = LowerLOAD(Op, DAG);
630 assert((!Result.getNode() ||
631 Result.getNode()->getNumValues() == 2) &&
632 "Load should return a value and a chain");
633 return Result;
634 }
635
636 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
637 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
638 case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
639 case ISD::INTRINSIC_VOID: {
640 SDValue Chain = Op.getOperand(0);
641 unsigned IntrinsicID =
642 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
643 switch (IntrinsicID) {
644 case AMDGPUIntrinsic::R600_store_swizzle: {
645 SDLoc DL(Op);
646 const SDValue Args[8] = {
647 Chain,
648 Op.getOperand(2), // Export Value
649 Op.getOperand(3), // ArrayBase
650 Op.getOperand(4), // Type
651 DAG.getConstant(0, DL, MVT::i32), // SWZ_X
652 DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
653 DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
654 DAG.getConstant(3, DL, MVT::i32) // SWZ_W
655 };
656 return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
657 }
658
659 // default for switch(IntrinsicID)
660 default: break;
661 }
662 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
663 break;
664 }
665 case ISD::INTRINSIC_WO_CHAIN: {
666 unsigned IntrinsicID =
667 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
668 EVT VT = Op.getValueType();
669 SDLoc DL(Op);
670 switch(IntrinsicID) {
671 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
672 case AMDGPUIntrinsic::r600_tex:
673 case AMDGPUIntrinsic::r600_texc:
674 case AMDGPUIntrinsic::r600_txl:
675 case AMDGPUIntrinsic::r600_txlc:
676 case AMDGPUIntrinsic::r600_txb:
677 case AMDGPUIntrinsic::r600_txbc:
678 case AMDGPUIntrinsic::r600_txf:
679 case AMDGPUIntrinsic::r600_txq:
680 case AMDGPUIntrinsic::r600_ddx:
681 case AMDGPUIntrinsic::r600_ddy: {
682 unsigned TextureOp;
683 switch (IntrinsicID) {
684 case AMDGPUIntrinsic::r600_tex:
685 TextureOp = 0;
686 break;
687 case AMDGPUIntrinsic::r600_texc:
688 TextureOp = 1;
689 break;
690 case AMDGPUIntrinsic::r600_txl:
691 TextureOp = 2;
692 break;
693 case AMDGPUIntrinsic::r600_txlc:
694 TextureOp = 3;
695 break;
696 case AMDGPUIntrinsic::r600_txb:
697 TextureOp = 4;
698 break;
699 case AMDGPUIntrinsic::r600_txbc:
700 TextureOp = 5;
701 break;
702 case AMDGPUIntrinsic::r600_txf:
703 TextureOp = 6;
704 break;
705 case AMDGPUIntrinsic::r600_txq:
706 TextureOp = 7;
707 break;
708 case AMDGPUIntrinsic::r600_ddx:
709 TextureOp = 8;
710 break;
711 case AMDGPUIntrinsic::r600_ddy:
712 TextureOp = 9;
713 break;
714 default:
715 llvm_unreachable("Unknow Texture Operation");
716 }
717
718 SDValue TexArgs[19] = {
719 DAG.getConstant(TextureOp, DL, MVT::i32),
720 Op.getOperand(1),
721 DAG.getConstant(0, DL, MVT::i32),
722 DAG.getConstant(1, DL, MVT::i32),
723 DAG.getConstant(2, DL, MVT::i32),
724 DAG.getConstant(3, DL, MVT::i32),
725 Op.getOperand(2),
726 Op.getOperand(3),
727 Op.getOperand(4),
728 DAG.getConstant(0, DL, MVT::i32),
729 DAG.getConstant(1, DL, MVT::i32),
730 DAG.getConstant(2, DL, MVT::i32),
731 DAG.getConstant(3, DL, MVT::i32),
732 Op.getOperand(5),
733 Op.getOperand(6),
734 Op.getOperand(7),
735 Op.getOperand(8),
736 Op.getOperand(9),
737 Op.getOperand(10)
738 };
739 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
740 }
741 case AMDGPUIntrinsic::r600_dot4: {
742 SDValue Args[8] = {
743 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
744 DAG.getConstant(0, DL, MVT::i32)),
745 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
746 DAG.getConstant(0, DL, MVT::i32)),
747 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
748 DAG.getConstant(1, DL, MVT::i32)),
749 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
750 DAG.getConstant(1, DL, MVT::i32)),
751 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
752 DAG.getConstant(2, DL, MVT::i32)),
753 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
754 DAG.getConstant(2, DL, MVT::i32)),
755 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
756 DAG.getConstant(3, DL, MVT::i32)),
757 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
758 DAG.getConstant(3, DL, MVT::i32))
759 };
760 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
761 }
762
763 case Intrinsic::r600_implicitarg_ptr: {
764 MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
765 uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
766 return DAG.getConstant(ByteOffset, DL, PtrVT);
767 }
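// The remaining implicit parameters (ngroups, global size, local size) live at
// fixed dword offsets in CONSTANT_BUFFER_0; LowerImplicitParameter emits the
// corresponding constant-buffer load.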
768 case Intrinsic::r600_read_ngroups_x:
769 return LowerImplicitParameter(DAG, VT, DL, 0);
770 case Intrinsic::r600_read_ngroups_y:
771 return LowerImplicitParameter(DAG, VT, DL, 1);
772 case Intrinsic::r600_read_ngroups_z:
773 return LowerImplicitParameter(DAG, VT, DL, 2);
774 case Intrinsic::r600_read_global_size_x:
775 return LowerImplicitParameter(DAG, VT, DL, 3);
776 case Intrinsic::r600_read_global_size_y:
777 return LowerImplicitParameter(DAG, VT, DL, 4);
778 case Intrinsic::r600_read_global_size_z:
779 return LowerImplicitParameter(DAG, VT, DL, 5);
780 case Intrinsic::r600_read_local_size_x:
781 return LowerImplicitParameter(DAG, VT, DL, 6);
782 case Intrinsic::r600_read_local_size_y:
783 return LowerImplicitParameter(DAG, VT, DL, 7);
784 case Intrinsic::r600_read_local_size_z:
785 return LowerImplicitParameter(DAG, VT, DL, 8);
786
787 case Intrinsic::r600_read_workdim:
788 case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
789 uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
790 return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
791 }
792
793 case Intrinsic::r600_read_tgid_x:
794 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
795 AMDGPU::T1_X, VT);
796 case Intrinsic::r600_read_tgid_y:
797 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
798 AMDGPU::T1_Y, VT);
799 case Intrinsic::r600_read_tgid_z:
800 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
801 AMDGPU::T1_Z, VT);
802 case Intrinsic::r600_read_tidig_x:
803 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
804 AMDGPU::T0_X, VT);
805 case Intrinsic::r600_read_tidig_y:
806 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
807 AMDGPU::T0_Y, VT);
808 case Intrinsic::r600_read_tidig_z:
809 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
810 AMDGPU::T0_Z, VT);
811
812 // FIXME: Should be renamed to r600 prefix
813 case AMDGPUIntrinsic::AMDGPU_rsq_clamped:
814 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
815
816 case Intrinsic::r600_rsq:
817 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
818 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
819 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
820 }
821 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
822 break;
823 }
824 } // end switch(Op.getOpcode())
825 return SDValue();
826 }
827
828 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
829 SmallVectorImpl<SDValue> &Results,
830 SelectionDAG &DAG) const {
831 switch (N->getOpcode()) {
832 default:
833 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
834 return;
835 case ISD::FP_TO_UINT:
836 if (N->getValueType(0) == MVT::i1) {
837 Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
838 return;
839 }
840 // Fall-through. Since we don't care about out of bounds values
841 // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
842 // considers some extra cases which are not necessary here.
843 case ISD::FP_TO_SINT: {
844 SDValue Result;
845 if (expandFP_TO_SINT(N, Result, DAG))
846 Results.push_back(Result);
847 return;
848 }
849 case ISD::SDIVREM: {
850 SDValue Op = SDValue(N, 1);
851 SDValue RES = LowerSDIVREM(Op, DAG);
852 Results.push_back(RES);
853 Results.push_back(RES.getValue(1));
854 break;
855 }
856 case ISD::UDIVREM: {
857 SDValue Op = SDValue(N, 0);
858 LowerUDIVREM64(Op, DAG, Results);
859 break;
860 }
861 }
862 }
863
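// Rebuild \p Vector as an AMDGPUISD::BUILD_VERTICAL_VECTOR (elements stored in
// the same channel of consecutive registers), the layout used when a vector
// element is indexed dynamically below.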
864 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
865 SDValue Vector) const {
866
867 SDLoc DL(Vector);
868 EVT VecVT = Vector.getValueType();
869 EVT EltVT = VecVT.getVectorElementType();
870 SmallVector<SDValue, 8> Args;
871
872 for (unsigned i = 0, e = VecVT.getVectorNumElements();
873 i != e; ++i) {
874 Args.push_back(DAG.getNode(
875 ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
876 DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
877 }
878
879 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
880 }
881
882 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
883 SelectionDAG &DAG) const {
884
885 SDLoc DL(Op);
886 SDValue Vector = Op.getOperand(0);
887 SDValue Index = Op.getOperand(1);
888
889 if (isa<ConstantSDNode>(Index) ||
890 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
891 return Op;
892
893 Vector = vectorToVerticalVector(DAG, Vector);
894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
895 Vector, Index);
896 }
897
898 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
899 SelectionDAG &DAG) const {
900 SDLoc DL(Op);
901 SDValue Vector = Op.getOperand(0);
902 SDValue Value = Op.getOperand(1);
903 SDValue Index = Op.getOperand(2);
904
905 if (isa<ConstantSDNode>(Index) ||
906 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
907 return Op;
908
909 Vector = vectorToVerticalVector(DAG, Vector);
910 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
911 Vector, Value, Index);
912 return vectorToVerticalVector(DAG, Insert);
913 }
914
915 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
916 SDValue Op,
917 SelectionDAG &DAG) const {
918
919 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
920 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
921 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
922
923 const DataLayout &DL = DAG.getDataLayout();
924 const GlobalValue *GV = GSD->getGlobal();
925 MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
926
927 SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
928 return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
929 }
930
931 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
932 // On hw >= R700, COS/SIN input must be between -1. and 1.
933 // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
934 EVT VT = Op.getValueType();
935 SDValue Arg = Op.getOperand(0);
936 SDLoc DL(Op);
937
938 // TODO: Should this propagate fast-math-flags?
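// 0.15915494309 is 1/(2*Pi), so FractPart = fract(x / (2*Pi) + 0.5).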
939 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
940 DAG.getNode(ISD::FADD, DL, VT,
941 DAG.getNode(ISD::FMUL, DL, VT, Arg,
942 DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
943 DAG.getConstantFP(0.5, DL, MVT::f32)));
944 unsigned TrigNode;
945 switch (Op.getOpcode()) {
946 case ISD::FCOS:
947 TrigNode = AMDGPUISD::COS_HW;
948 break;
949 case ISD::FSIN:
950 TrigNode = AMDGPUISD::SIN_HW;
951 break;
952 default:
953 llvm_unreachable("Wrong trig opcode");
954 }
955 SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
956 DAG.getNode(ISD::FADD, DL, VT, FractPart,
957 DAG.getConstantFP(-0.5, DL, MVT::f32)));
958 if (Gen >= R600Subtarget::R700)
959 return TrigVal;
960 // On R600 hw, COS/SIN input must be between -Pi and Pi.
961 return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
962 DAG.getConstantFP(3.14159265359, DL, MVT::f32));
963 }
964
965 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
966 SDLoc DL(Op);
967 EVT VT = Op.getValueType();
968
969 SDValue Lo = Op.getOperand(0);
970 SDValue Hi = Op.getOperand(1);
971 SDValue Shift = Op.getOperand(2);
972 SDValue Zero = DAG.getConstant(0, DL, VT);
973 SDValue One = DAG.getConstant(1, DL, VT);
974
975 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
976 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
977 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
978 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
979
980 // The dance around Width1 is necessary for the 0 special case.
981 // Without it, CompShift might be 32, producing incorrect results in
982 // Overflow. So we do the shift in two steps; the alternative is to
983 // add a conditional to filter out the special case.
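// For example, with Shift == 0 the two-step form yields (Lo >> 31) >> 1 == 0,
// whereas a single Lo >> 32 would be an out-of-range shift.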
984
985 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
986 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
987
988 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
989 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
990 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
991
992 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
993 SDValue LoBig = Zero;
994
995 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
996 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
997
998 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
999 }
1000
1001 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1002 SDLoc DL(Op);
1003 EVT VT = Op.getValueType();
1004
1005 SDValue Lo = Op.getOperand(0);
1006 SDValue Hi = Op.getOperand(1);
1007 SDValue Shift = Op.getOperand(2);
1008 SDValue Zero = DAG.getConstant(0, DL, VT);
1009 SDValue One = DAG.getConstant(1, DL, VT);
1010
1011 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1012
1013 SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1014 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1015 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1016 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1017
1018 // The dance around Width1 is necessary for the 0 special case.
1019 // Without it, CompShift might be 32, producing incorrect results in
1020 // Overflow. So we do the shift in two steps; the alternative is to
1021 // add a conditional to filter out the special case.
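// For example, with Shift == 0 the two-step form yields (Hi << 31) << 1 == 0,
// whereas a single Hi << 32 would be an out-of-range shift.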
1022
1023 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1024 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1025
1026 SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1027 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1028 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1029
1030 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1031 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1032
1033 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1034 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1035
1036 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1037 }
1038
1039 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1040 unsigned mainop, unsigned ovf) const {
1041 SDLoc DL(Op);
1042 EVT VT = Op.getValueType();
1043
1044 SDValue Lo = Op.getOperand(0);
1045 SDValue Hi = Op.getOperand(1);
1046
1047 SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1048 // Extend sign.
1049 OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1050 DAG.getValueType(MVT::i1));
1051
1052 SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1053
1054 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1055 }
1056
1057 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1058 SDLoc DL(Op);
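// FP_TO_UINT to i1 is lowered as a setne compare against 0.0.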
1059 return DAG.getNode(
1060 ISD::SETCC,
1061 DL,
1062 MVT::i1,
1063 Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1064 DAG.getCondCode(ISD::SETNE)
1065 );
1066 }
1067
1068 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1069 const SDLoc &DL,
1070 unsigned DwordOffset) const {
1071 unsigned ByteOffset = DwordOffset * 4;
1072 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1073 AMDGPUAS::CONSTANT_BUFFER_0);
1074
1075 // We shouldn't be using an offset wider than 16 bits for implicit parameters.
1076 assert(isInt<16>(ByteOffset));
1077
1078 return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1079 DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1080 MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1081 false, false, false, 0);
1082 }
1083
1084 bool R600TargetLowering::isZero(SDValue Op) const {
1085 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1086 return Cst->isNullValue();
1087 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1088 return CstFP->isZero();
1089 } else {
1090 return false;
1091 }
1092 }
1093
1094 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
1095 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1096 return CFP->isExactlyValue(1.0);
1097 }
1098 return isAllOnesConstant(Op);
1099 }
1100
1101 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
1102 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1103 return CFP->getValueAPF().isZero();
1104 }
1105 return isNullConstant(Op);
1106 }
1107
1108 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1109 SDLoc DL(Op);
1110 EVT VT = Op.getValueType();
1111
1112 SDValue LHS = Op.getOperand(0);
1113 SDValue RHS = Op.getOperand(1);
1114 SDValue True = Op.getOperand(2);
1115 SDValue False = Op.getOperand(3);
1116 SDValue CC = Op.getOperand(4);
1117 SDValue Temp;
1118
1119 if (VT == MVT::f32) {
1120 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1121 SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1122 if (MinMax)
1123 return MinMax;
1124 }
1125
1126 // LHS and RHS are guaranteed to be the same value type
1127 EVT CompareVT = LHS.getValueType();
1128
1129 // Check if we can lower this to a native operation.
1130
1131 // Try to lower to a SET* instruction:
1132 //
1133 // SET* can match the following patterns:
1134 //
1135 // select_cc f32, f32, -1, 0, cc_supported
1136 // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1137 // select_cc i32, i32, -1, 0, cc_supported
1138 //
1139
1140 // Move hardware True/False values to the correct operand.
1141 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1142 ISD::CondCode InverseCC =
1143 ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1144 if (isHWTrueValue(False) && isHWFalseValue(True)) {
1145 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1146 std::swap(False, True);
1147 CC = DAG.getCondCode(InverseCC);
1148 } else {
1149 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1150 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1151 std::swap(False, True);
1152 std::swap(LHS, RHS);
1153 CC = DAG.getCondCode(SwapInvCC);
1154 }
1155 }
1156 }
1157
1158 if (isHWTrueValue(True) && isHWFalseValue(False) &&
1159 (CompareVT == VT || VT == MVT::i32)) {
1160 // This can be matched by a SET* instruction.
1161 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1162 }
1163
1164 // Try to lower to a CND* instruction:
1165 //
1166 // CND* can match the following patterns:
1167 //
1168 // select_cc f32, 0.0, f32, f32, cc_supported
1169 // select_cc f32, 0.0, i32, i32, cc_supported
1170 // select_cc i32, 0, f32, f32, cc_supported
1171 // select_cc i32, 0, i32, i32, cc_supported
1172 //
1173
1174 // Try to move the zero value to the RHS
1175 if (isZero(LHS)) {
1176 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1177 // Try swapping the operands
1178 ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1179 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1180 std::swap(LHS, RHS);
1181 CC = DAG.getCondCode(CCSwapped);
1182 } else {
1183 // Try inverting the condition and then swapping the operands.
1184 ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1185 CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1186 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1187 std::swap(True, False);
1188 std::swap(LHS, RHS);
1189 CC = DAG.getCondCode(CCSwapped);
1190 }
1191 }
1192 }
1193 if (isZero(RHS)) {
1194 SDValue Cond = LHS;
1195 SDValue Zero = RHS;
1196 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1197 if (CompareVT != VT) {
1198 // Bitcast True / False to the correct types. This will end up being
1199 // a nop, but it allows us to define only a single pattern in the
1200 // .TD files for each CND* instruction rather than having to have
1201 // one pattern for integer True/False and one for fp True/False
1202 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1203 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1204 }
1205
1206 switch (CCOpcode) {
1207 case ISD::SETONE:
1208 case ISD::SETUNE:
1209 case ISD::SETNE:
1210 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1211 Temp = True;
1212 True = False;
1213 False = Temp;
1214 break;
1215 default:
1216 break;
1217 }
1218 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1219 Cond, Zero,
1220 True, False,
1221 DAG.getCondCode(CCOpcode));
1222 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1223 }
1224
1225 // If we make it this far, it means we have no native instructions to handle
1226 // this SELECT_CC, so we must lower it.
1227 SDValue HWTrue, HWFalse;
1228
1229 if (CompareVT == MVT::f32) {
1230 HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1231 HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1232 } else if (CompareVT == MVT::i32) {
1233 HWTrue = DAG.getConstant(-1, DL, CompareVT);
1234 HWFalse = DAG.getConstant(0, DL, CompareVT);
1235 }
1236 else {
1237 llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1238 }
1239
1240 // Lower this unsupported SELECT_CC into a combination of two supported
1241 // SELECT_CC operations.
1242 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1243
1244 return DAG.getNode(ISD::SELECT_CC, DL, VT,
1245 Cond, HWFalse,
1246 True, False,
1247 DAG.getCondCode(ISD::SETNE));
1248 }
1249
1250 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to
1251 /// convert these pointers to a register index. Each register holds
1252 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1253 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1254 /// for indirect addressing.
1255 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1256 unsigned StackWidth,
1257 SelectionDAG &DAG) const {
1258 unsigned SRLPad;
1259 switch(StackWidth) {
1260 case 1:
1261 SRLPad = 2;
1262 break;
1263 case 2:
1264 SRLPad = 3;
1265 break;
1266 case 4:
1267 SRLPad = 4;
1268 break;
1269 default: llvm_unreachable("Invalid stack width");
1270 }
1271
1272 SDLoc DL(Ptr);
1273 return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1274 DAG.getConstant(SRLPad, DL, MVT::i32));
1275 }
1276
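/// Compute which sub-register channel and pointer increment correspond to
/// element \p ElemIdx of a value stored with the given \p StackWidth.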
1277 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1278 unsigned ElemIdx,
1279 unsigned &Channel,
1280 unsigned &PtrIncr) const {
1281 switch (StackWidth) {
1282 default:
1283 case 1:
1284 Channel = 0;
1285 if (ElemIdx > 0) {
1286 PtrIncr = 1;
1287 } else {
1288 PtrIncr = 0;
1289 }
1290 break;
1291 case 2:
1292 Channel = ElemIdx % 2;
1293 if (ElemIdx == 2) {
1294 PtrIncr = 1;
1295 } else {
1296 PtrIncr = 0;
1297 }
1298 break;
1299 case 4:
1300 Channel = ElemIdx;
1301 PtrIncr = 0;
1302 break;
1303 }
1304 }
1305
1306 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1307 SelectionDAG &DAG) const {
1308 SDLoc DL(Store);
1309
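// Private memory is accessed a dword at a time, so a narrow (i8/i16) store is
// lowered as a read-modify-write: load the containing dword, clear the target
// byte/halfword with an inverted mask, OR in the shifted value, and store it back.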
1310 unsigned Mask = 0;
1311 if (Store->getMemoryVT() == MVT::i8) {
1312 Mask = 0xff;
1313 } else if (Store->getMemoryVT() == MVT::i16) {
1314 Mask = 0xffff;
1315 }
1316
1317 SDValue Chain = Store->getChain();
1318 SDValue BasePtr = Store->getBasePtr();
1319 EVT MemVT = Store->getMemoryVT();
1320
1321 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1322 DAG.getConstant(2, DL, MVT::i32));
1323 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1324 Chain, Ptr,
1325 DAG.getTargetConstant(0, DL, MVT::i32));
1326
1327 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1328 DAG.getConstant(0x3, DL, MVT::i32));
1329
1330 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1331 DAG.getConstant(3, DL, MVT::i32));
1332
1333 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1334 Store->getValue());
1335
1336 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1337
1338 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1339 MaskedValue, ShiftAmt);
1340
1341 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1342 DAG.getConstant(Mask, DL, MVT::i32),
1343 ShiftAmt);
1344 DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1345 DAG.getConstant(0xffffffff, DL, MVT::i32));
1346 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1347
1348 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1349 return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1350 Chain, Value, Ptr,
1351 DAG.getTargetConstant(0, DL, MVT::i32));
1352 }
1353
1354 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1355 if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
1356 return Result;
1357
1358 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1359 unsigned AS = StoreNode->getAddressSpace();
1360 SDValue Value = StoreNode->getValue();
1361 EVT ValueVT = Value.getValueType();
1362
1363 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1364 ValueVT.isVector()) {
1365 return SplitVectorStore(Op, DAG);
1366 }
1367
1368 SDLoc DL(Op);
1369 SDValue Chain = StoreNode->getChain();
1370 SDValue Ptr = StoreNode->getBasePtr();
1371
1372 if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1373 if (StoreNode->isTruncatingStore()) {
1374 EVT VT = Value.getValueType();
1375 assert(VT.bitsLE(MVT::i32));
1376 EVT MemVT = StoreNode->getMemoryVT();
1377 SDValue MaskConstant;
1378 if (MemVT == MVT::i8) {
1379 MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1380 } else {
1381 assert(MemVT == MVT::i16);
1382 MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1383 }
1384 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1385 DAG.getConstant(2, DL, MVT::i32));
1386 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1387 DAG.getConstant(0x00000003, DL, VT));
1388 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1389 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1390 DAG.getConstant(3, DL, VT));
1391 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1392 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1393 // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1394 // vector instead.
1395 SDValue Src[4] = {
1396 ShiftedValue,
1397 DAG.getConstant(0, DL, MVT::i32),
1398 DAG.getConstant(0, DL, MVT::i32),
1399 Mask
1400 };
1401 SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1402 SDValue Args[3] = { Chain, Input, DWordAddr };
1403 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1404 Op->getVTList(), Args, MemVT,
1405 StoreNode->getMemOperand());
1406 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1407 ValueVT.bitsGE(MVT::i32)) {
1408 // Convert pointer from byte address to dword address.
1409 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1410 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1411 Ptr, DAG.getConstant(2, DL, MVT::i32)));
1412
1413 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1414 llvm_unreachable("Truncated and indexed stores not supported yet");
1415 } else {
1416 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1417 }
1418 return Chain;
1419 }
1420 }
1421
1422 if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1423 return SDValue();
1424
1425 EVT MemVT = StoreNode->getMemoryVT();
1426 if (MemVT.bitsLT(MVT::i32))
1427 return lowerPrivateTruncStore(StoreNode, DAG);
1428
1429 // Lowering for indirect addressing
1430 const MachineFunction &MF = DAG.getMachineFunction();
1431 const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1432 unsigned StackWidth = TFL->getStackWidth(MF);
1433
1434 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1435
1436 if (ValueVT.isVector()) {
1437 unsigned NumElemVT = ValueVT.getVectorNumElements();
1438 EVT ElemVT = ValueVT.getVectorElementType();
1439 SmallVector<SDValue, 4> Stores(NumElemVT);
1440
1441 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1442 "vector width in load");
1443
1444 for (unsigned i = 0; i < NumElemVT; ++i) {
1445 unsigned Channel, PtrIncr;
1446 getStackAddress(StackWidth, i, Channel, PtrIncr);
1447 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1448 DAG.getConstant(PtrIncr, DL, MVT::i32));
1449 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1450 Value, DAG.getConstant(i, DL, MVT::i32));
1451
1452 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1453 Chain, Elem, Ptr,
1454 DAG.getTargetConstant(Channel, DL, MVT::i32));
1455 }
1456 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1457 } else {
1458 if (ValueVT == MVT::i8) {
1459 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1460 }
1461 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1462 DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1463 }
1464
1465 return Chain;
1466 }
1467
1468 // Return 512 + (kc_bank << 12) for a constant buffer address space, or -1 otherwise.
1469 static int
1470 ConstantAddressBlock(unsigned AddressSpace) {
1471 switch (AddressSpace) {
1472 case AMDGPUAS::CONSTANT_BUFFER_0:
1473 return 512;
1474 case AMDGPUAS::CONSTANT_BUFFER_1:
1475 return 512 + 4096;
1476 case AMDGPUAS::CONSTANT_BUFFER_2:
1477 return 512 + 4096 * 2;
1478 case AMDGPUAS::CONSTANT_BUFFER_3:
1479 return 512 + 4096 * 3;
1480 case AMDGPUAS::CONSTANT_BUFFER_4:
1481 return 512 + 4096 * 4;
1482 case AMDGPUAS::CONSTANT_BUFFER_5:
1483 return 512 + 4096 * 5;
1484 case AMDGPUAS::CONSTANT_BUFFER_6:
1485 return 512 + 4096 * 6;
1486 case AMDGPUAS::CONSTANT_BUFFER_7:
1487 return 512 + 4096 * 7;
1488 case AMDGPUAS::CONSTANT_BUFFER_8:
1489 return 512 + 4096 * 8;
1490 case AMDGPUAS::CONSTANT_BUFFER_9:
1491 return 512 + 4096 * 9;
1492 case AMDGPUAS::CONSTANT_BUFFER_10:
1493 return 512 + 4096 * 10;
1494 case AMDGPUAS::CONSTANT_BUFFER_11:
1495 return 512 + 4096 * 11;
1496 case AMDGPUAS::CONSTANT_BUFFER_12:
1497 return 512 + 4096 * 12;
1498 case AMDGPUAS::CONSTANT_BUFFER_13:
1499 return 512 + 4096 * 13;
1500 case AMDGPUAS::CONSTANT_BUFFER_14:
1501 return 512 + 4096 * 14;
1502 case AMDGPUAS::CONSTANT_BUFFER_15:
1503 return 512 + 4096 * 15;
1504 default:
1505 return -1;
1506 }
1507 }
1508
1509 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1510 SelectionDAG &DAG) const {
1511 SDLoc DL(Op);
1512 LoadSDNode *Load = cast<LoadSDNode>(Op);
1513 ISD::LoadExtType ExtType = Load->getExtensionType();
1514 EVT MemVT = Load->getMemoryVT();
1515
1516 // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
1517 // register (2-)byte extract.
1518
1519 // Get Register holding the target.
1520 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1521 DAG.getConstant(2, DL, MVT::i32));
1522 // Load the Register.
1523 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1524 Load->getChain(),
1525 Ptr,
1526 DAG.getTargetConstant(0, DL, MVT::i32),
1527 Op.getOperand(2));
1528
1529 // Get offset within the register.
1530 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1531 Load->getBasePtr(),
1532 DAG.getConstant(0x3, DL, MVT::i32));
1533
1534 // Bit offset of target byte (byteIdx * 8).
1535 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1536 DAG.getConstant(3, DL, MVT::i32));
1537
1538 // Shift to the right.
1539 Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1540
1541 // Eliminate the upper bits by setting them to ...
1542 EVT MemEltVT = MemVT.getScalarType();
1543
1544 // ... ones.
1545 if (ExtType == ISD::SEXTLOAD) {
1546 SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1547
1548 SDValue Ops[] = {
1549 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1550 Load->getChain()
1551 };
1552
1553 return DAG.getMergeValues(Ops, DL);
1554 }
1555
1556 // ... or zeros.
1557 SDValue Ops[] = {
1558 DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1559 Load->getChain()
1560 };
1561
1562 return DAG.getMergeValues(Ops, DL);
1563 }
1564
1565 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1566 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1567 unsigned AS = LoadNode->getAddressSpace();
1568 EVT MemVT = LoadNode->getMemoryVT();
1569 ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1570
1571 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1572 ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1573 return lowerPrivateExtLoad(Op, DAG);
1574 }
1575
1576 SDLoc DL(Op);
1577 EVT VT = Op.getValueType();
1578 SDValue Chain = LoadNode->getChain();
1579 SDValue Ptr = LoadNode->getBasePtr();
1580
1581 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1582 SDValue MergedValues[2] = {
1583 scalarizeVectorLoad(LoadNode, DAG),
1584 Chain
1585 };
1586 return DAG.getMergeValues(MergedValues, DL);
1587 }
1588
1589 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1590 if (ConstantBlock > -1 &&
1591 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1592 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1593 SDValue Result;
1594 if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1595 isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1596 isa<ConstantSDNode>(Ptr)) {
1597 SDValue Slots[4];
1598 for (unsigned i = 0; i < 4; i++) {
1599 // We want Const position encoded with the following formula :
1600 // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1601 // const_index is Ptr computed by llvm using an alignment of 16.
1602 // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1603 // then div by 4 at the ISel step
1604 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1605 DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1606 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1607 }
1608 EVT NewVT = MVT::v4i32;
1609 unsigned NumElements = 4;
1610 if (VT.isVector()) {
1611 NewVT = VT;
1612 NumElements = VT.getVectorNumElements();
1613 }
1614 Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1615 } else {
1616 // A non-constant pointer can't be folded; keep it as a v4i32 CONST_ADDRESS load.
1617 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1618 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1619 DAG.getConstant(4, DL, MVT::i32)),
1620 DAG.getConstant(LoadNode->getAddressSpace() -
1621 AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1622 );
1623 }
1624
1625 if (!VT.isVector()) {
1626 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1627 DAG.getConstant(0, DL, MVT::i32));
1628 }
1629
1630 SDValue MergedValues[2] = {
1631 Result,
1632 Chain
1633 };
1634 return DAG.getMergeValues(MergedValues, DL);
1635 }
1636
1637 SDValue LoweredLoad;
1638
1639 // For most operations returning SDValue() will result in the node being
1640 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1641 // need to manually expand loads that may be legal in some address spaces and
1642 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1643 // compute shaders, since the data is sign extended when it is uploaded to the
1644 // buffer. However, SEXT loads from other address spaces are not supported, so
1645 // we need to expand them here.
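// i.e. (sextload mem) is expanded to
// (sign_extend_inreg (extload mem), MemVT), as done below.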
1646 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1647 EVT MemVT = LoadNode->getMemoryVT();
1648 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1649 SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1650 LoadNode->getPointerInfo(), MemVT,
1651 LoadNode->isVolatile(),
1652 LoadNode->isNonTemporal(),
1653 LoadNode->isInvariant(),
1654 LoadNode->getAlignment());
1655 SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1656 DAG.getValueType(MemVT));
1657
1658 SDValue MergedValues[2] = { Res, Chain };
1659 return DAG.getMergeValues(MergedValues, DL);
1660 }
1661
1662 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1663 return SDValue();
1664 }
1665
1666 // Lowering for indirect addressing
1667 const MachineFunction &MF = DAG.getMachineFunction();
1668 const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1669 unsigned StackWidth = TFL->getStackWidth(MF);
1670
1671 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1672
1673 if (VT.isVector()) {
1674 unsigned NumElemVT = VT.getVectorNumElements();
1675 EVT ElemVT = VT.getVectorElementType();
1676 SDValue Loads[4];
1677
1678 assert(NumElemVT <= 4);
1679 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1680 "vector width in load");
1681
1682 for (unsigned i = 0; i < NumElemVT; ++i) {
1683 unsigned Channel, PtrIncr;
1684 getStackAddress(StackWidth, i, Channel, PtrIncr);
1685 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1686 DAG.getConstant(PtrIncr, DL, MVT::i32));
1687 Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1688 Chain, Ptr,
1689 DAG.getTargetConstant(Channel, DL, MVT::i32),
1690 Op.getOperand(2));
1691 }
1692 EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1693 LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1694 } else {
1695 LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1696 Chain, Ptr,
1697 DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1698 Op.getOperand(2));
1699 }
1700
1701 SDValue Ops[2] = {
1702 LoweredLoad,
1703 Chain
1704 };
1705
1706 return DAG.getMergeValues(Ops, DL);
1707 }
1708
1709 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1710 SDValue Chain = Op.getOperand(0);
1711 SDValue Cond = Op.getOperand(1);
1712 SDValue Jump = Op.getOperand(2);
1713
1714 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1715 Chain, Jump, Cond);
1716 }
1717
1718 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1719 SelectionDAG &DAG) const {
1720 MachineFunction &MF = DAG.getMachineFunction();
1721 const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1722
1723 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1724
1725 unsigned FrameIndex = FIN->getIndex();
1726 unsigned IgnoredFrameReg;
1727 unsigned Offset =
1728 TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
1729 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1730 Op.getValueType());
1731 }
1732
1733 /// XXX Only kernel functions are supported, so we can assume for now that
1734 /// every function is a kernel function, but in the future we should use
1735 /// separate calling conventions for kernel and non-kernel functions.
1736 SDValue R600TargetLowering::LowerFormalArguments(
1737 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1738 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1739 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1740 SmallVector<CCValAssign, 16> ArgLocs;
1741 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1742 *DAG.getContext());
1743 MachineFunction &MF = DAG.getMachineFunction();
1744 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1745
1746 SmallVector<ISD::InputArg, 8> LocalIns;
1747
1748 getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1749
1750 AnalyzeFormalArguments(CCInfo, LocalIns);
1751
1752 for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1753 CCValAssign &VA = ArgLocs[i];
1754 const ISD::InputArg &In = Ins[i];
1755 EVT VT = In.VT;
1756 EVT MemVT = VA.getLocVT();
1757 if (!VT.isVector() && MemVT.isVector()) {
1758 // Get load source type if scalarized.
1759 MemVT = MemVT.getVectorElementType();
1760 }
1761
1762 if (AMDGPU::isShader(CallConv)) {
1763 unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1764 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1765 InVals.push_back(Register);
1766 continue;
1767 }
1768
1769 PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1770 AMDGPUAS::CONSTANT_BUFFER_0);
1771
1772 // i64 isn't a legal type, so the register type used ends up as i32, which
1773 // isn't expected here. It attempts to create this sextload, but it ends up
1774 // being invalid. Somehow this seems to work with i64 arguments, but breaks
1775 // for <1 x i64>.
1776
1777 // The first 36 bytes of the input buffer contain information about the
1778 // thread group and global sizes.
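// Kernel arguments therefore start at byte offset 36; the 36 is added to each
// argument's location offset when forming the load address below.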
1779 ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1780 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1781 // FIXME: This should really check the extload type, but the handling of
1782 // extload vector parameters seems to be broken.
1783
1784 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1785 Ext = ISD::SEXTLOAD;
1786 }
1787
1788 // Compute the offset from the value.
1789 // XXX - I think PartOffset should give you this, but it seems to give the
1790 // size of the register, which isn't useful.
1791
1792 unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1793 unsigned PartOffset = VA.getLocMemOffset();
1794 unsigned Offset = 36 + VA.getLocMemOffset();
1795
1796 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1797 SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1798 DAG.getConstant(Offset, DL, MVT::i32),
1799 DAG.getUNDEF(MVT::i32),
1800 PtrInfo,
1801 MemVT, false, true, true, 4);
1802
1803 // 4 is the preferred alignment for the CONSTANT memory space.
1804 InVals.push_back(Arg);
1805 MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1806 }
1807 return Chain;
1808 }
1809
1810 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1811 EVT VT) const {
1812 if (!VT.isVector())
1813 return MVT::i32;
1814 return VT.changeVectorElementTypeToInteger();
1815 }
1816
1817 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1818 unsigned AddrSpace,
1819 unsigned Align,
1820 bool *IsFast) const {
1821 if (IsFast)
1822 *IsFast = false;
1823
1824 if (!VT.isSimple() || VT == MVT::Other)
1825 return false;
1826
1827 if (VT.bitsLT(MVT::i32))
1828 return false;
1829
1830 // TODO: This is a rough estimate.
1831 if (IsFast)
1832 *IsFast = true;
1833
1834 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1835 }
1836
1837 static SDValue CompactSwizzlableVector(
1838 SelectionDAG &DAG, SDValue VectorEntry,
1839 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1840 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1841 assert(RemapSwizzle.empty());
1842 SDValue NewBldVec[4] = {
1843 VectorEntry.getOperand(0),
1844 VectorEntry.getOperand(1),
1845 VectorEntry.getOperand(2),
1846 VectorEntry.getOperand(3)
1847 };
1848
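// Swizzle select values used below: 0-3 pick the X/Y/Z/W channels of the
// source, 4 is SEL_0 (constant 0.0), 5 is SEL_1 (constant 1.0), and 7 is
// SEL_MASK_WRITE (the channel is not written).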
1849 for (unsigned i = 0; i < 4; i++) {
1850 if (NewBldVec[i].isUndef())
1851 // We mask the write here to teach later passes that the i-th element of
1852 // this vector is undef. This lets us reduce 128-bit register usage,
1853 // break false dependencies, and additionally make the assembly easier to read.
1854 RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1855 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1856 if (C->isZero()) {
1857 RemapSwizzle[i] = 4; // SEL_0
1858 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1859 } else if (C->isExactlyValue(1.0)) {
1860 RemapSwizzle[i] = 5; // SEL_1
1861 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1862 }
1863 }
1864
1865 if (NewBldVec[i].isUndef())
1866 continue;
1867 for (unsigned j = 0; j < i; j++) {
1868 if (NewBldVec[i] == NewBldVec[j]) {
1869 NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1870 RemapSwizzle[i] = j;
1871 break;
1872 }
1873 }
1874 }
1875
1876 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1877 NewBldVec);
1878 }
1879
1880 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1881 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1882 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1883 assert(RemapSwizzle.empty());
1884 SDValue NewBldVec[4] = {
1885 VectorEntry.getOperand(0),
1886 VectorEntry.getOperand(1),
1887 VectorEntry.getOperand(2),
1888 VectorEntry.getOperand(3)
1889 };
1890 bool isUnmovable[4] = { false, false, false, false };
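// First pass: mark operands that already extract from their own lane as
// unmovable. Second pass (below): move one remaining extract into its source
// lane, if that lane is free, and record the swizzle swap.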
1891 for (unsigned i = 0; i < 4; i++) {
1892 RemapSwizzle[i] = i;
1893 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1894 unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1895 ->getZExtValue();
1896 if (i == Idx)
1897 isUnmovable[Idx] = true;
1898 }
1899 }
1900
1901 for (unsigned i = 0; i < 4; i++) {
1902 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1903 unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1904 ->getZExtValue();
1905 if (isUnmovable[Idx])
1906 continue;
1907 // Swap i and Idx
1908 std::swap(NewBldVec[Idx], NewBldVec[i]);
1909 std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1910 break;
1911 }
1912 }
1913
1914 return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1915 NewBldVec);
1916 }
1917
1918 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1919 SelectionDAG &DAG,
1920 const SDLoc &DL) const {
1921 assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1922 // Old -> New swizzle values
1923 DenseMap<unsigned, unsigned> SwizzleRemap;
1924
1925 BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1926 for (unsigned i = 0; i < 4; i++) {
1927 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1928 if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1929 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1930 }
1931
1932 SwizzleRemap.clear();
1933 BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1934 for (unsigned i = 0; i < 4; i++) {
1935 unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1936 if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1937 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1938 }
1939
1940 return BuildVector;
1941 }
1942
1943
1944 //===----------------------------------------------------------------------===//
1945 // Custom DAG Optimizations
1946 //===----------------------------------------------------------------------===//
1947
1948 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1949 DAGCombinerInfo &DCI) const {
1950 SelectionDAG &DAG = DCI.DAG;
1951
1952 switch (N->getOpcode()) {
1953 default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1954 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1955 case ISD::FP_ROUND: {
1956 SDValue Arg = N->getOperand(0);
1957 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1958 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1959 Arg.getOperand(0));
1960 }
1961 break;
1962 }
1963
1964 // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1965 // (i32 select_cc f32, f32, -1, 0 cc)
1966 //
1967 // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1968 // this to one of the SET*_DX10 instructions.
1969 case ISD::FP_TO_SINT: {
1970 SDValue FNeg = N->getOperand(0);
1971 if (FNeg.getOpcode() != ISD::FNEG) {
1972 return SDValue();
1973 }
1974 SDValue SelectCC = FNeg.getOperand(0);
1975 if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1976 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1977 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1978 !isHWTrueValue(SelectCC.getOperand(2)) ||
1979 !isHWFalseValue(SelectCC.getOperand(3))) {
1980 return SDValue();
1981 }
1982
1983 SDLoc dl(N);
1984 return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1985 SelectCC.getOperand(0), // LHS
1986 SelectCC.getOperand(1), // RHS
1987 DAG.getConstant(-1, dl, MVT::i32), // True
1988 DAG.getConstant(0, dl, MVT::i32), // False
1989 SelectCC.getOperand(4)); // CC
1990
1991 break;
1992 }
1993
1994 // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1995 // => build_vector elt0, ... , NewEltIdx, ... , eltN
1996 case ISD::INSERT_VECTOR_ELT: {
1997 SDValue InVec = N->getOperand(0);
1998 SDValue InVal = N->getOperand(1);
1999 SDValue EltNo = N->getOperand(2);
2000 SDLoc dl(N);
2001
2002 // If the inserted element is an UNDEF, just use the input vector.
2003 if (InVal.isUndef())
2004 return InVec;
2005
2006 EVT VT = InVec.getValueType();
2007
2008 // If we can't generate a legal BUILD_VECTOR, exit
2009 if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
2010 return SDValue();
2011
2012 // Check that we know which element is being inserted
2013 if (!isa<ConstantSDNode>(EltNo))
2014 return SDValue();
2015 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
2016
2017 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
2018 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
2019 // vector elements.
2020 SmallVector<SDValue, 8> Ops;
2021 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
2022 Ops.append(InVec.getNode()->op_begin(),
2023 InVec.getNode()->op_end());
2024 } else if (InVec.isUndef()) {
2025 unsigned NElts = VT.getVectorNumElements();
2026 Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
2027 } else {
2028 return SDValue();
2029 }
2030
2031 // Insert the element
2032 if (Elt < Ops.size()) {
2033 // All the operands of BUILD_VECTOR must have the same type;
2034 // we enforce that here.
2035 EVT OpVT = Ops[0].getValueType();
2036 if (InVal.getValueType() != OpVT)
2037 InVal = OpVT.bitsGT(InVal.getValueType()) ?
2038 DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2039 DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2040 Ops[Elt] = InVal;
2041 }
2042
2043 // Return the new vector
2044 return DAG.getBuildVector(VT, dl, Ops);
2045 }
2046
2047 // EXTRACT_VECTOR_ELT of a BUILD_VECTOR generated by custom lowering
2048 // also needs to be custom combined.
2049 case ISD::EXTRACT_VECTOR_ELT: {
2050 SDValue Arg = N->getOperand(0);
2051 if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2052 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2053 unsigned Element = Const->getZExtValue();
2054 return Arg->getOperand(Element);
2055 }
2056 }
2057 if (Arg.getOpcode() == ISD::BITCAST &&
2058 Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2059 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2060 unsigned Element = Const->getZExtValue();
2061 return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2062 Arg->getOperand(0).getOperand(Element));
2063 }
2064 }
2065 break;
2066 }
2067
2068 case ISD::SELECT_CC: {
2069 // Try common optimizations
2070 if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2071 return Ret;
2072
2073 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2074 // selectcc x, y, a, b, inv(cc)
2075 //
2076 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2077 // selectcc x, y, a, b, cc
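// The outer node compares the inner select against its own false value:
// equality means the inner condition was false (fold to the inverted
// condition), inequality means it was true (fold to the inner select),
// assuming the true and false values differ.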
2078 SDValue LHS = N->getOperand(0);
2079 if (LHS.getOpcode() != ISD::SELECT_CC) {
2080 return SDValue();
2081 }
2082
2083 SDValue RHS = N->getOperand(1);
2084 SDValue True = N->getOperand(2);
2085 SDValue False = N->getOperand(3);
2086 ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2087
2088 if (LHS.getOperand(2).getNode() != True.getNode() ||
2089 LHS.getOperand(3).getNode() != False.getNode() ||
2090 RHS.getNode() != False.getNode()) {
2091 return SDValue();
2092 }
2093
2094 switch (NCC) {
2095 default: return SDValue();
2096 case ISD::SETNE: return LHS;
2097 case ISD::SETEQ: {
2098 ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2099 LHSCC = ISD::getSetCCInverse(LHSCC,
2100 LHS.getOperand(0).getValueType().isInteger());
2101 if (DCI.isBeforeLegalizeOps() ||
2102 isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2103 return DAG.getSelectCC(SDLoc(N),
2104 LHS.getOperand(0),
2105 LHS.getOperand(1),
2106 LHS.getOperand(2),
2107 LHS.getOperand(3),
2108 LHSCC);
2109 break;
2110 }
2111 }
2112 return SDValue();
2113 }
2114
2115 case AMDGPUISD::EXPORT: {
2116 SDValue Arg = N->getOperand(1);
2117 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2118 break;
2119
2120 SDValue NewArgs[8] = {
2121 N->getOperand(0), // Chain
2122 SDValue(),
2123 N->getOperand(2), // ArrayBase
2124 N->getOperand(3), // Type
2125 N->getOperand(4), // SWZ_X
2126 N->getOperand(5), // SWZ_Y
2127 N->getOperand(6), // SWZ_Z
2128 N->getOperand(7) // SWZ_W
2129 };
2130 SDLoc DL(N);
2131 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2132 return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2133 }
2134 case AMDGPUISD::TEXTURE_FETCH: {
2135 SDValue Arg = N->getOperand(1);
2136 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2137 break;
2138
2139 SDValue NewArgs[19] = {
2140 N->getOperand(0),
2141 N->getOperand(1),
2142 N->getOperand(2),
2143 N->getOperand(3),
2144 N->getOperand(4),
2145 N->getOperand(5),
2146 N->getOperand(6),
2147 N->getOperand(7),
2148 N->getOperand(8),
2149 N->getOperand(9),
2150 N->getOperand(10),
2151 N->getOperand(11),
2152 N->getOperand(12),
2153 N->getOperand(13),
2154 N->getOperand(14),
2155 N->getOperand(15),
2156 N->getOperand(16),
2157 N->getOperand(17),
2158 N->getOperand(18),
2159 };
2160 SDLoc DL(N);
2161 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2162 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2163 }
2164 }
2165
2166 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2167 }
2168
2169 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
2170 SDValue &Src, SDValue &Neg, SDValue &Abs,
2171 SDValue &Sel, SDValue &Imm,
2172 SelectionDAG &DAG) const {
2173 const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2174 if (!Src.isMachineOpcode())
2175 return false;
2176
2177 switch (Src.getMachineOpcode()) {
2178 case AMDGPU::FNEG_R600:
2179 if (!Neg.getNode())
2180 return false;
2181 Src = Src.getOperand(0);
2182 Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2183 return true;
2184 case AMDGPU::FABS_R600:
2185 if (!Abs.getNode())
2186 return false;
2187 Src = Src.getOperand(0);
2188 Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2189 return true;
2190 case AMDGPU::CONST_COPY: {
2191 unsigned Opcode = ParentNode->getMachineOpcode();
2192 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2193
2194 if (!Sel.getNode())
2195 return false;
2196
2197 SDValue CstOffset = Src.getOperand(0);
2198 if (ParentNode->getValueType(0).isVector())
2199 return false;
2200
2201 // Gather the constant values already referenced by this instruction.
2202 int SrcIndices[] = {
2203 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2204 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2205 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2206 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2207 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2208 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2209 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2210 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2211 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2212 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2213 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2214 };
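// For every source of the parent that already reads ALU_CONST, record its
// kcache select so fitsConstReadLimitations below sees the complete set.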
2215 std::vector<unsigned> Consts;
2216 for (int OtherSrcIdx : SrcIndices) {
2217 int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2218 if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2219 continue;
2220 if (HasDst) {
2221 OtherSrcIdx--;
2222 OtherSelIdx--;
2223 }
2224 if (RegisterSDNode *Reg =
2225 dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2226 if (Reg->getReg() == AMDGPU::ALU_CONST) {
2227 ConstantSDNode *Cst
2228 = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2229 Consts.push_back(Cst->getZExtValue());
2230 }
2231 }
2232 }
2233
2234 ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2235 Consts.push_back(Cst->getZExtValue());
2236 if (!TII->fitsConstReadLimitations(Consts)) {
2237 return false;
2238 }
2239
2240 Sel = CstOffset;
2241 Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2242 return true;
2243 }
2244 case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2245 // Check that the Imm (literal) slot is not already in use; same check as below.
2246 if (cast<ConstantSDNode>(Imm)->getZExtValue())
2247 return false;
2248 Imm = Src.getOperand(0);
2249 Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2250 return true;
2251 case AMDGPU::MOV_IMM_I32:
2252 case AMDGPU::MOV_IMM_F32: {
2253 unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2254 uint64_t ImmValue = 0;
2255
2256
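// Small constants map onto dedicated inline-constant registers (ZERO, HALF,
// ONE, ONE_INT); everything else goes through the ALU_LITERAL_X slot with the
// raw bits carried in ImmValue.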
2257 if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2258 ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2259 float FloatValue = FPC->getValueAPF().convertToFloat();
2260 if (FloatValue == 0.0) {
2261 ImmReg = AMDGPU::ZERO;
2262 } else if (FloatValue == 0.5) {
2263 ImmReg = AMDGPU::HALF;
2264 } else if (FloatValue == 1.0) {
2265 ImmReg = AMDGPU::ONE;
2266 } else {
2267 ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2268 }
2269 } else {
2270 ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2271 uint64_t Value = C->getZExtValue();
2272 if (Value == 0) {
2273 ImmReg = AMDGPU::ZERO;
2274 } else if (Value == 1) {
2275 ImmReg = AMDGPU::ONE_INT;
2276 } else {
2277 ImmValue = Value;
2278 }
2279 }
2280
2281 // Check that we aren't already using an immediate.
2282 // XXX: It's possible for an instruction to have more than one
2283 // immediate operand, but this is not supported yet.
2284 if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2285 if (!Imm.getNode())
2286 return false;
2287 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2288 assert(C);
2289 if (C->getZExtValue())
2290 return false;
2291 Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2292 }
2293 Src = DAG.getRegister(ImmReg, MVT::i32);
2294 return true;
2295 }
2296 default:
2297 return false;
2298 }
2299 }
2300
2301 /// \brief Fold the instructions after selecting them
2302 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2303 SelectionDAG &DAG) const {
2304 const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2305 if (!Node->isMachineOpcode())
2306 return Node;
2307
2308 unsigned Opcode = Node->getMachineOpcode();
2309 SDValue FakeOp;
2310
2311 std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2312
2313 if (Opcode == AMDGPU::DOT_4) {
2314 int OperandIdx[] = {
2315 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2316 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2317 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2318 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2319 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2320 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2321 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2322 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2323 };
2324 int NegIdx[] = {
2325 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2326 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2327 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2328 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2329 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2330 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2331 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2332 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2333 };
2334 int AbsIdx[] = {
2335 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2336 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2337 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2338 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2339 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2340 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2341 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2342 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2343 };
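// Operand indices from TII count the explicit dst operand, which is not part
// of the SDNode operand list, hence the -1 adjustments below.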
2344 for (unsigned i = 0; i < 8; i++) {
2345 if (OperandIdx[i] < 0)
2346 return Node;
2347 SDValue &Src = Ops[OperandIdx[i] - 1];
2348 SDValue &Neg = Ops[NegIdx[i] - 1];
2349 SDValue &Abs = Ops[AbsIdx[i] - 1];
2350 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2351 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2352 if (HasDst)
2353 SelIdx--;
2354 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2355 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2356 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2357 }
2358 } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2359 for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2360 SDValue &Src = Ops[i];
2361 if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2362 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2363 }
2364 } else if (Opcode == AMDGPU::CLAMP_R600) {
2365 SDValue Src = Node->getOperand(0);
2366 if (!Src.isMachineOpcode() ||
2367 !TII->hasInstrModifiers(Src.getMachineOpcode()))
2368 return Node;
2369 int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2370 AMDGPU::OpName::clamp);
2371 if (ClampIdx < 0)
2372 return Node;
2373 SDLoc DL(Node);
2374 std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2375 Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2376 return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2377 Node->getVTList(), Ops);
2378 } else {
2379 if (!TII->hasInstrModifiers(Opcode))
2380 return Node;
2381 int OperandIdx[] = {
2382 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2383 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2384 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2385 };
2386 int NegIdx[] = {
2387 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2388 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2389 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2390 };
2391 int AbsIdx[] = {
2392 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2393 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2394 -1
2395 };
2396 for (unsigned i = 0; i < 3; i++) {
2397 if (OperandIdx[i] < 0)
2398 return Node;
2399 SDValue &Src = Ops[OperandIdx[i] - 1];
2400 SDValue &Neg = Ops[NegIdx[i] - 1];
2401 SDValue FakeAbs;
2402 SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2403 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2404 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2405 int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2406 if (HasDst) {
2407 SelIdx--;
2408 ImmIdx--;
2409 }
2410 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2411 SDValue &Imm = Ops[ImmIdx];
2412 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2413 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2414 }
2415 }
2416
2417 return Node;
2418 }
2419