//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"

/// createNVPTXISelDag - This pass converts a legalized DAG into an
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > 0);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
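    // (Illustrative: a pattern such as (and (srl x, 4), 255) can often be
    // folded into a single bit-field extract; the exact patterns handled
    // live in tryBFE.)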
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 immediates in .f16 ops, so we have to
// load them into an .f16 register first.
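// (Illustrative, assuming the usual PTX lowering: a constant such as
// (f16 1.0) is selected to a LOAD_CONST_F16 machine node, which is later
// printed as a move of the immediate bit pattern, e.g. 0x3C00, into an
// .f16 register.)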
bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
  if (N->getValueType(0) != MVT::f16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
  SDNode *LoadConstF16 =
      CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
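// (For example, ISD::SETOLT maps to CmpMode::LT below; when FTZ is set,
// FTZ_FLAG is OR'ed into the mode so the printer can emit the flush-to-zero
// variant of the comparison.)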
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
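// For example, uses such as (extractelt v2f16:%v, 0) and (extractelt %v, 1)
// are all rewritten to read the two results of a single SplitF16x2 node.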
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about f16x2 as it's the only real vector type we
  // need to deal with.
  if (Vector.getSimpleValueType() != MVT::v2f16)
    return false;

  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (const auto &U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  unsigned Op = NVPTX::SplitF16x2;
  // If the vector has been BITCAST'ed from i32, we can use the original
  // value directly and avoid a register-to-register move.
  SDValue Source = Vector;
  if (Vector->getOpcode() == ISD::BITCAST) {
    Op = NVPTX::SplitI32toF16x2;
    Source = Vector->getOperand(0);
  }
  // Merge (f16 extractelt(V, 0), f16 extractelt(V, 1))
  // into f16,f16 SplitF16x2(V)
  SDNode *ScatterOp =
      CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and
  // ideally not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
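  //
  // (Illustrative IR that currently qualifies: a load carrying
  // !invariant.load metadata from the global address space, or a load
  // through a readonly, noalias kernel pointer parameter.)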
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use GetUnderlyingObjects() here instead of GetUnderlyingObject()
  // mainly because the former looks through phi nodes while the latter does
  // not. We need to look through phi nodes to handle pointer induction
  // variables.
  SmallVector<Value *, 8> Objs;
  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
                       Objs, F->getDataLayout());

  return all_of(Objs, [&](Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce the amount of boilerplate code for
// opcode selection.
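// (Usage sketch: callers pass one opcode per supported value type, e.g. the
// MVT::f32 slot is returned for f32 accesses; a None slot, used where no
// i64/f64 form exists, makes the caller bail out of custom selection.)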
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
    return Opcode_f16;
  case MVT::v2f16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return None;
  }
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // Do not support pre/post inc/dec.
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However, these features were only
  // added with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement
  // them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values).
  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, NVPTXLD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values).
  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value.
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    FromType = NVPTX::PTXLdStInstCode::Unsigned;

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have an ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
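  // (Illustrative: a v8f16 load is thus emitted as a single ld.v4.b32 whose
  // four 32-bit results each hold one v2f16 chunk.)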
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                          NVPTX::LDV_f32_v4_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                          NVPTX::LDV_f32_v4_asi, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                            NVPTX::LDV_f32_v4_ari, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  }

  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // Vectors of f16 are loaded/stored as multiples of v2f16 elements.
    if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = MVT::v2f16;
      NumElts /= 2;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
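  // (For example, an i8 v2 load here gets the VT list (i16, i16, Other):
  // two promoted result values plus the chain.)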
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
                               NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
                               NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1650 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1651 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1652 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1653 break;
1654 }
1655 }
1656 if (!Opcode)
1657 return false;
1658 SDValue Ops[] = { Op1, Chain };
1659 LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1660 }
1661
1662 MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
1663 MemRefs0[0] = Mem->getMemOperand();
1664 cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
1665
1666 // For automatic generation of LDG (through SelectLoad[Vector], not the
1667 // intrinsics), we may have an extending load like:
1668 //
1669 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1670 //
1671 // In this case, the matching logic above will select a load for the original
1672 // memory type (in this case, i8) and our types will not match (the node needs
1673 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1674 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1675 // CVT instruction. Ptxas should clean up any redundancies here.
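  //
  // As a purely illustrative, hand-written sketch (hypothetical registers),
  // the i8 -> i32 zext load above ends up as the narrow load plus an explicit
  // widening conversion, roughly:
  //
  //   ld.global.nc.u8 %rs1, [%rd1];
  //   cvt.u32.u8      %r1, %rs1;
  //
  // The exact CVT variant and register classes depend on the opcode selected
  // above.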

  EVT OrigType = N->getValueType(0);
  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT && LdNode) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
                                       EltVT.getSimpleVT(), IsSigned);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use st.release or insert fences. However, these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;
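  // For illustration only (hand-written, hypothetical IR): a store this check
  // currently rejects would look like
  //
  //   store atomic i32 %v, i32* %p seq_cst, align 4
  //
  // Returning false sends the node back to the generic selection path rather
  // than emitting a plain st with weaker-than-requested semantics.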

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;
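  // Sketch of the effect (hypothetical operands): a volatile i32 store through
  // a global pointer would select to
  //
  //   st.volatile.global.u32 [%rd1], %r1;
  //
  // while the same store through e.g. a .local pointer drops the .volatile
  // qualifier, since PTX does not define it for that space.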

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }
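  // E.g. (illustrative, hypothetical operands) a v2f16 store then comes out as
  // a single 32-bit untyped store:
  //
  //   st.global.b32 [%rd1], %r1;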

  unsigned int toType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    toType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType;
  if (ScalarVT.isFloatingPoint())
    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    ToType = NVPTX::PTXLdStInstCode::Unsigned;

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have an st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }
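  // Sketch (hypothetical registers): a v8f16 store thus becomes
  //
  //   st.global.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
  //
  // with each %rN holding one v2f16 chunk.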

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));
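  // StOps now holds the value operands followed by the five flag immediates;
  // the addressing operands and finally the chain are appended below, so a
  // StoreV2 through a plain register ends up as
  // {val0, val1, volatile, addrspace, vectype, type, width, addr, chain}.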

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
                          NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
                          NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
                          NVPTX::STV_f32_v4_avar, None);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
                          NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
                          NVPTX::STV_f32_v4_asi, None);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
            NVPTX::STV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
                            NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
                            NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
                            NVPTX::STV_f32_v4_ari, None);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
            NVPTX::STV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
                            NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
                            NVPTX::STV_f32_v4_areg, None);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);

  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, ST);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Flag = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  Optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
                        NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(
        MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
        NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
        NVPTX::LoadParamMemV4F32, None);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
    VTs = CurDAG->getVTList(EVTs);
  }
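  // E.g. a LoadParamV2 of f32 gets the VT list (f32, f32, Other, Glue); the
  // trailing glue keeps the load tied to the call sequence it is part of.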

  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Flag);

  ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
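  // E.g. (illustrative, hypothetical operands) returning an i1 then produces
  //
  //   st.param.b8 [func_retval0+0], %rs1;
  //
  // with the i1 -> i8 widening already done during lowering.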
  Optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, None,
                             NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
                             NVPTX::StoreRetvalV4F32, None);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Flag = N->getOperand(N->getNumOperands() - 1);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreParamU32:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParam:
    NumElts = 1;
    break;
  case NVPTXISD::StoreParamV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreParamV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Flag);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
  Optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    switch (NumElts) {
    default:
      return false;
    case 1:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                               NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
      break;
    case 2:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
                               NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
      break;
    case 4:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
                               NVPTX::StoreParamV4I32, None,
                               NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
                               NVPTX::StoreParamV4F32, None);
      break;
    }
    if (!Opcode)
      return false;
    break;
  // Special case: if we have a sign-extend/zero-extend node, insert the
  // conversion instruction first, and use that as the value operand to
  // the selected StoreParam node.
  case NVPTXISD::StoreParamU32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
                                         MVT::i32, Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  case NVPTXISD::StoreParamS32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
                                                MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
                                         MVT::i32, Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  }
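  // Sketch of the StoreParamU32 path (illustrative, hypothetical registers):
  // the extra CVT widens the 16-bit value before the 32-bit param store,
  // roughly
  //
  //   cvt.u32.u16  %r1, %rs1;
  //   st.param.b32 [param0+0], %r1;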

  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
  SDNode *Ret =
      CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32;
    break;
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;
  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Suld1DI8Clamp:
    Opc = NVPTX::SULD_1D_I8_CLAMP;
    break;
  case NVPTXISD::Suld1DI16Clamp:
    Opc = NVPTX::SULD_1D_I16_CLAMP;
    break;
  case NVPTXISD::Suld1DI32Clamp:
    Opc = NVPTX::SULD_1D_I32_CLAMP;
    break;
  case NVPTXISD::Suld1DI64Clamp:
    Opc = NVPTX::SULD_1D_I64_CLAMP;
    break;
  case NVPTXISD::Suld1DV2I8Clamp:
    Opc = NVPTX::SULD_1D_V2I8_CLAMP;
    break;
  case NVPTXISD::Suld1DV2I16Clamp:
    Opc = NVPTX::SULD_1D_V2I16_CLAMP;
    break;
  case NVPTXISD::Suld1DV2I32Clamp:
    Opc = NVPTX::SULD_1D_V2I32_CLAMP;
    break;
  case NVPTXISD::Suld1DV2I64Clamp:
    Opc = NVPTX::SULD_1D_V2I64_CLAMP;
    break;
  case NVPTXISD::Suld1DV4I8Clamp:
    Opc = NVPTX::SULD_1D_V4I8_CLAMP;
    break;
  case NVPTXISD::Suld1DV4I16Clamp:
    Opc = NVPTX::SULD_1D_V4I16_CLAMP;
    break;
  case NVPTXISD::Suld1DV4I32Clamp:
    Opc = NVPTX::SULD_1D_V4I32_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayI8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayI16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayI32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayI64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV2I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV2I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV2I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV2I64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV4I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV4I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
    break;
  case NVPTXISD::Suld1DArrayV4I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
    break;
  case NVPTXISD::Suld2DI8Clamp:
    Opc = NVPTX::SULD_2D_I8_CLAMP;
    break;
  case NVPTXISD::Suld2DI16Clamp:
    Opc = NVPTX::SULD_2D_I16_CLAMP;
    break;
  case NVPTXISD::Suld2DI32Clamp:
    Opc = NVPTX::SULD_2D_I32_CLAMP;
    break;
  case NVPTXISD::Suld2DI64Clamp:
    Opc = NVPTX::SULD_2D_I64_CLAMP;
    break;
  case NVPTXISD::Suld2DV2I8Clamp:
    Opc = NVPTX::SULD_2D_V2I8_CLAMP;
    break;
  case NVPTXISD::Suld2DV2I16Clamp:
    Opc = NVPTX::SULD_2D_V2I16_CLAMP;
    break;
  case NVPTXISD::Suld2DV2I32Clamp:
    Opc = NVPTX::SULD_2D_V2I32_CLAMP;
    break;
  case NVPTXISD::Suld2DV2I64Clamp:
    Opc = NVPTX::SULD_2D_V2I64_CLAMP;
    break;
  case NVPTXISD::Suld2DV4I8Clamp:
    Opc = NVPTX::SULD_2D_V4I8_CLAMP;
    break;
  case NVPTXISD::Suld2DV4I16Clamp:
    Opc = NVPTX::SULD_2D_V4I16_CLAMP;
    break;
  case NVPTXISD::Suld2DV4I32Clamp:
    Opc = NVPTX::SULD_2D_V4I32_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayI8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayI16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayI32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayI64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV2I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV2I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV2I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV2I64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV4I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV4I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
    break;
  case NVPTXISD::Suld2DArrayV4I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
    break;
  case NVPTXISD::Suld3DI8Clamp:
    Opc = NVPTX::SULD_3D_I8_CLAMP;
    break;
  case NVPTXISD::Suld3DI16Clamp:
    Opc = NVPTX::SULD_3D_I16_CLAMP;
    break;
  case NVPTXISD::Suld3DI32Clamp:
    Opc = NVPTX::SULD_3D_I32_CLAMP;
    break;
  case NVPTXISD::Suld3DI64Clamp:
    Opc = NVPTX::SULD_3D_I64_CLAMP;
    break;
  case NVPTXISD::Suld3DV2I8Clamp:
    Opc = NVPTX::SULD_3D_V2I8_CLAMP;
    break;
  case NVPTXISD::Suld3DV2I16Clamp:
    Opc = NVPTX::SULD_3D_V2I16_CLAMP;
    break;
  case NVPTXISD::Suld3DV2I32Clamp:
    Opc = NVPTX::SULD_3D_V2I32_CLAMP;
    break;
  case NVPTXISD::Suld3DV2I64Clamp:
    Opc = NVPTX::SULD_3D_V2I64_CLAMP;
    break;
  case NVPTXISD::Suld3DV4I8Clamp:
    Opc = NVPTX::SULD_3D_V4I8_CLAMP;
    break;
  case NVPTXISD::Suld3DV4I16Clamp:
    Opc = NVPTX::SULD_3D_V4I16_CLAMP;
    break;
  case NVPTXISD::Suld3DV4I32Clamp:
    Opc = NVPTX::SULD_3D_V4I32_CLAMP;
    break;
  case NVPTXISD::Suld1DI8Trap:
    Opc = NVPTX::SULD_1D_I8_TRAP;
    break;
  case NVPTXISD::Suld1DI16Trap:
    Opc = NVPTX::SULD_1D_I16_TRAP;
    break;
  case NVPTXISD::Suld1DI32Trap:
    Opc = NVPTX::SULD_1D_I32_TRAP;
    break;
  case NVPTXISD::Suld1DI64Trap:
    Opc = NVPTX::SULD_1D_I64_TRAP;
    break;
  case NVPTXISD::Suld1DV2I8Trap:
    Opc = NVPTX::SULD_1D_V2I8_TRAP;
    break;
  case NVPTXISD::Suld1DV2I16Trap:
    Opc = NVPTX::SULD_1D_V2I16_TRAP;
    break;
  case NVPTXISD::Suld1DV2I32Trap:
    Opc = NVPTX::SULD_1D_V2I32_TRAP;
    break;
  case NVPTXISD::Suld1DV2I64Trap:
    Opc = NVPTX::SULD_1D_V2I64_TRAP;
    break;
  case NVPTXISD::Suld1DV4I8Trap:
    Opc = NVPTX::SULD_1D_V4I8_TRAP;
    break;
  case NVPTXISD::Suld1DV4I16Trap:
    Opc = NVPTX::SULD_1D_V4I16_TRAP;
    break;
  case NVPTXISD::Suld1DV4I32Trap:
    Opc = NVPTX::SULD_1D_V4I32_TRAP;
    break;
  case NVPTXISD::Suld1DArrayI8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
    break;
  case NVPTXISD::Suld1DArrayI16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
    break;
  case NVPTXISD::Suld1DArrayI32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
    break;
  case NVPTXISD::Suld1DArrayI64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV2I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV2I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV2I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV2I64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV4I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV4I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
    break;
  case NVPTXISD::Suld1DArrayV4I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
    break;
  case NVPTXISD::Suld2DI8Trap:
    Opc = NVPTX::SULD_2D_I8_TRAP;
    break;
  case NVPTXISD::Suld2DI16Trap:
    Opc = NVPTX::SULD_2D_I16_TRAP;
    break;
  case NVPTXISD::Suld2DI32Trap:
    Opc = NVPTX::SULD_2D_I32_TRAP;
    break;
  case NVPTXISD::Suld2DI64Trap:
    Opc = NVPTX::SULD_2D_I64_TRAP;
    break;
  case NVPTXISD::Suld2DV2I8Trap:
    Opc = NVPTX::SULD_2D_V2I8_TRAP;
    break;
  case NVPTXISD::Suld2DV2I16Trap:
    Opc = NVPTX::SULD_2D_V2I16_TRAP;
    break;
  case NVPTXISD::Suld2DV2I32Trap:
    Opc = NVPTX::SULD_2D_V2I32_TRAP;
    break;
  case NVPTXISD::Suld2DV2I64Trap:
    Opc = NVPTX::SULD_2D_V2I64_TRAP;
    break;
  case NVPTXISD::Suld2DV4I8Trap:
    Opc = NVPTX::SULD_2D_V4I8_TRAP;
    break;
  case NVPTXISD::Suld2DV4I16Trap:
    Opc = NVPTX::SULD_2D_V4I16_TRAP;
    break;
  case NVPTXISD::Suld2DV4I32Trap:
    Opc = NVPTX::SULD_2D_V4I32_TRAP;
    break;
  case NVPTXISD::Suld2DArrayI8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
    break;
  case NVPTXISD::Suld2DArrayI16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
    break;
  case NVPTXISD::Suld2DArrayI32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
    break;
  case NVPTXISD::Suld2DArrayI64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV2I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV2I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV2I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV2I64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV4I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV4I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
    break;
  case NVPTXISD::Suld2DArrayV4I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
    break;
  case NVPTXISD::Suld3DI8Trap:
    Opc = NVPTX::SULD_3D_I8_TRAP;
    break;
  case NVPTXISD::Suld3DI16Trap:
    Opc = NVPTX::SULD_3D_I16_TRAP;
    break;
  case NVPTXISD::Suld3DI32Trap:
    Opc = NVPTX::SULD_3D_I32_TRAP;
    break;
  case NVPTXISD::Suld3DI64Trap:
    Opc = NVPTX::SULD_3D_I64_TRAP;
    break;
  case NVPTXISD::Suld3DV2I8Trap:
    Opc = NVPTX::SULD_3D_V2I8_TRAP;
    break;
  case NVPTXISD::Suld3DV2I16Trap:
    Opc = NVPTX::SULD_3D_V2I16_TRAP;
    break;
  case NVPTXISD::Suld3DV2I32Trap:
    Opc = NVPTX::SULD_3D_V2I32_TRAP;
    break;
  case NVPTXISD::Suld3DV2I64Trap:
    Opc = NVPTX::SULD_3D_V2I64_TRAP;
    break;
  case NVPTXISD::Suld3DV4I8Trap:
    Opc = NVPTX::SULD_3D_V4I8_TRAP;
    break;
  case NVPTXISD::Suld3DV4I16Trap:
    Opc = NVPTX::SULD_3D_V4I16_TRAP;
    break;
  case NVPTXISD::Suld3DV4I32Trap:
    Opc = NVPTX::SULD_3D_V4I32_TRAP;
    break;
  case NVPTXISD::Suld1DI8Zero:
    Opc = NVPTX::SULD_1D_I8_ZERO;
    break;
  case NVPTXISD::Suld1DI16Zero:
    Opc = NVPTX::SULD_1D_I16_ZERO;
    break;
  case NVPTXISD::Suld1DI32Zero:
    Opc = NVPTX::SULD_1D_I32_ZERO;
    break;
  case NVPTXISD::Suld1DI64Zero:
    Opc = NVPTX::SULD_1D_I64_ZERO;
    break;
  case NVPTXISD::Suld1DV2I8Zero:
    Opc = NVPTX::SULD_1D_V2I8_ZERO;
    break;
  case NVPTXISD::Suld1DV2I16Zero:
    Opc = NVPTX::SULD_1D_V2I16_ZERO;
    break;
  case NVPTXISD::Suld1DV2I32Zero:
    Opc = NVPTX::SULD_1D_V2I32_ZERO;
    break;
  case NVPTXISD::Suld1DV2I64Zero:
    Opc = NVPTX::SULD_1D_V2I64_ZERO;
    break;
  case NVPTXISD::Suld1DV4I8Zero:
    Opc = NVPTX::SULD_1D_V4I8_ZERO;
    break;
  case NVPTXISD::Suld1DV4I16Zero:
    Opc = NVPTX::SULD_1D_V4I16_ZERO;
    break;
  case NVPTXISD::Suld1DV4I32Zero:
    Opc = NVPTX::SULD_1D_V4I32_ZERO;
    break;
  case NVPTXISD::Suld1DArrayI8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
    break;
  case NVPTXISD::Suld1DArrayI16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
    break;
  case NVPTXISD::Suld1DArrayI32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
    break;
  case NVPTXISD::Suld1DArrayI64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV2I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV2I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV2I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV2I64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV4I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV4I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
    break;
  case NVPTXISD::Suld1DArrayV4I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
    break;
  case NVPTXISD::Suld2DI8Zero:
    Opc = NVPTX::SULD_2D_I8_ZERO;
    break;
  case NVPTXISD::Suld2DI16Zero:
    Opc = NVPTX::SULD_2D_I16_ZERO;
    break;
  case NVPTXISD::Suld2DI32Zero:
    Opc = NVPTX::SULD_2D_I32_ZERO;
    break;
  case NVPTXISD::Suld2DI64Zero:
    Opc = NVPTX::SULD_2D_I64_ZERO;
    break;
  case NVPTXISD::Suld2DV2I8Zero:
    Opc = NVPTX::SULD_2D_V2I8_ZERO;
    break;
  case NVPTXISD::Suld2DV2I16Zero:
    Opc = NVPTX::SULD_2D_V2I16_ZERO;
    break;
  case NVPTXISD::Suld2DV2I32Zero:
    Opc = NVPTX::SULD_2D_V2I32_ZERO;
    break;
  case NVPTXISD::Suld2DV2I64Zero:
    Opc = NVPTX::SULD_2D_V2I64_ZERO;
    break;
  case NVPTXISD::Suld2DV4I8Zero:
    Opc = NVPTX::SULD_2D_V4I8_ZERO;
    break;
  case NVPTXISD::Suld2DV4I16Zero:
    Opc = NVPTX::SULD_2D_V4I16_ZERO;
    break;
  case NVPTXISD::Suld2DV4I32Zero:
    Opc = NVPTX::SULD_2D_V4I32_ZERO;
    break;
  case NVPTXISD::Suld2DArrayI8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
    break;
  case NVPTXISD::Suld2DArrayI16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
    break;
  case NVPTXISD::Suld2DArrayI32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
    break;
  case NVPTXISD::Suld2DArrayI64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV2I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV2I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV2I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV2I64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV4I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV4I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
    break;
  case NVPTXISD::Suld2DArrayV4I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
    break;
  case NVPTXISD::Suld3DI8Zero:
    Opc = NVPTX::SULD_3D_I8_ZERO;
    break;
  case NVPTXISD::Suld3DI16Zero:
    Opc = NVPTX::SULD_3D_I16_ZERO;
    break;
  case NVPTXISD::Suld3DI32Zero:
    Opc = NVPTX::SULD_3D_I32_ZERO;
    break;
  case NVPTXISD::Suld3DI64Zero:
    Opc = NVPTX::SULD_3D_I64_ZERO;
    break;
  case NVPTXISD::Suld3DV2I8Zero:
    Opc = NVPTX::SULD_3D_V2I8_ZERO;
    break;
  case NVPTXISD::Suld3DV2I16Zero:
    Opc = NVPTX::SULD_3D_V2I16_ZERO;
    break;
  case NVPTXISD::Suld3DV2I32Zero:
    Opc = NVPTX::SULD_3D_V2I32_ZERO;
    break;
  case NVPTXISD::Suld3DV2I64Zero:
    Opc = NVPTX::SULD_3D_V2I64_ZERO;
    break;
  case NVPTXISD::Suld3DV4I8Zero:
    Opc = NVPTX::SULD_3D_V4I8_ZERO;
    break;
  case NVPTXISD::Suld3DV4I16Zero:
    Opc = NVPTX::SULD_3D_V4I16_ZERO;
    break;
  case NVPTXISD::Suld3DV4I32Zero:
    Opc = NVPTX::SULD_3D_V4I32_ZERO;
    break;
  }

  // Copy over operands
3373 SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3374 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3375
3376 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3377 return true;
3378 }

/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
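/// For example, assuming a 32-bit value, the patterns
///   (and (srl %x, 8), 255)   and   (sra (shl %x, 24), 24)
/// select to bfe.u32 %r, %x, 8, 8 and bfe.s32 %r, %x, 0, 8 respectively.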
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits, so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    uint64_t NumBits = countTrailingOnes(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = countTrailingZeros(MaskVal);
        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
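        // e.g., MaskVal == 0xff00 with ShiftAmt == 8 yields NumZeros == 8,
        // NumOnes == 8, and thus NumBits == 8 (bits [15:8] of the input).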
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
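      // e.g., (sra (shl %val, 24), 28) keeps bits [7:4] of a 32-bit %val and
      // sign-extends them, i.e. bfe.s32 %r, %val, 4, 4
      // (Start == 28 - 24 == 4, Len == 32 - 28 == 4).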
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also checked that the inner shift is <= the outer
      // shift, this also implies that the inner shift is < the type size).
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }

  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {
    Val, Start, Len
  };

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}

// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
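// For example, a TargetGlobalAddress, either bare or wrapped in a
// NVPTXISD::Wrapper node, is matched directly as its symbol.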
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}

// symbol+offset
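// e.g., (add (NVPTXISD::Wrapper tglobaladdr:@g), 4) matches with
// Base == @g and Offset == 4.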
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                           mvt);
        return true;
      }
    }
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

// register+offset
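// e.g., (add %reg, 16) matches with Base == %reg and Offset == 16; a frame
// index base is rewritten to a TargetFrameIndex first.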
bool NVPTXDAGToDAGISel::SelectADDRri_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}

/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
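/// Note that, per the SelectionDAGISel convention, this hook returns false on
/// success and true on failure; on success OutOps holds a base/offset pair.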
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::Constraint_m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}

/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
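/// For example, GetConvertOpcode(MVT::i32, MVT::i16, /*IsSigned=*/true)
/// returns NVPTX::CVT_s32_s16.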
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             bool IsSigned) {
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  }
}