• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines an instruction selector for the NVPTX target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelDAGToDAG.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/Support/AtomicOrdering.h"
20 #include "llvm/Support/CommandLine.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/ErrorHandling.h"
23 #include "llvm/Support/raw_ostream.h"
24 #include "llvm/Target/TargetIntrinsicInfo.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "nvptx-isel"
29 
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
/// \param TM the NVPTX target machine the pass will select for.
/// \param OptLevel optimization level, forwarded to the selector.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}
36 
// Constructor: forwards to the common SelectionDAGISel base and caches the
// target machine. Wide-multiply selection (doMulWide) is only enabled when
// optimizing, i.e. at any level above -O0.
NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > 0);
}
42 
// Cache the NVPTX subtarget for the function about to be selected, then let
// the generic SelectionDAGISel driver do the actual work.
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
  return SelectionDAGISel::runOnMachineFunction(MF);
}
47 
// Query the target lowering's policy for how f32 division should be lowered.
// The meaning of the returned level is defined by NVPTXTargetLowering.
int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}
51 
// True if f32 square root should use the precise (IEEE-rounded) form rather
// than the approximate one; the decision is delegated to the target lowering.
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}
55 
// True if f32 operations should flush denormals to zero (.ftz) for the
// current function; delegated to the target lowering, which may consult
// per-function attributes via *MF.
bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}
59 
// True if mul+add may be contracted into fused multiply-add for the current
// function at the current optimization level; policy lives in the lowering.
bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}
64 
// True if unsafe (non-IEEE) FP transformations are permitted for the current
// function; delegated to the target lowering.
bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}
69 
// True if 32-bit pointers are used for non-generic address spaces while the
// target is otherwise 64-bit ("short pointer" mode); read from the TM.
bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}
73 
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
///
/// Dispatches on the node's opcode to a family-specific try* routine; when
/// that routine declines (returns false) we fall through to the
/// table-generated matcher via SelectCode().
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  // Scalar and atomic loads/stores share the same custom selection paths.
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    // Unlike the try* routines, this always succeeds, so return directly.
    SelectSETP_F16X2(N);
    return;

  // Vector load/store and ldg/ldu (cache-streaming/uniform) variants.
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  // Parameter-passing and return-value pseudo-ops.
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // Texture fetches (independent- and unified-mode, plus tld4 gathers) all
  // funnel into tryTextureIntrinsic.
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // Surface loads (suld), in all geometry/element-type combinations and the
  // three out-of-bounds behaviors (Clamp / Trap / Zero).
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  // Nothing custom matched (or the try* routine declined): fall back to the
  // TableGen-generated matcher.
  SelectCode(N);
}
505 
tryIntrinsicChain(SDNode * N)506 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
507   unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
508   switch (IID) {
509   default:
510     return false;
511   case Intrinsic::nvvm_ldg_global_f:
512   case Intrinsic::nvvm_ldg_global_i:
513   case Intrinsic::nvvm_ldg_global_p:
514   case Intrinsic::nvvm_ldu_global_f:
515   case Intrinsic::nvvm_ldu_global_i:
516   case Intrinsic::nvvm_ldu_global_p:
517     return tryLDGLDU(N);
518   }
519 }
520 
// There's no way to specify FP16 immediates in .f16 ops, so we have to
// load them into an .f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
  // Only f16-typed constants need the materializing move; anything else is
  // left to the default matcher.
  if (N->getValueType(0) != MVT::f16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
  // LOAD_CONST_F16 moves the immediate into an .f16 register.
  SDNode *LoadConstF16 =
      CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}
533 
534 // Map ISD:CONDCODE value to appropriate CmpMode expected by
535 // NVPTXInstPrinter::printCmpMode()
getPTXCmpMode(const CondCodeSDNode & CondCode,bool FTZ)536 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
537   using NVPTX::PTXCmpMode::CmpMode;
538   unsigned PTXCmpMode = [](ISD::CondCode CC) {
539     switch (CC) {
540     default:
541       llvm_unreachable("Unexpected condition code.");
542     case ISD::SETOEQ:
543       return CmpMode::EQ;
544     case ISD::SETOGT:
545       return CmpMode::GT;
546     case ISD::SETOGE:
547       return CmpMode::GE;
548     case ISD::SETOLT:
549       return CmpMode::LT;
550     case ISD::SETOLE:
551       return CmpMode::LE;
552     case ISD::SETONE:
553       return CmpMode::NE;
554     case ISD::SETO:
555       return CmpMode::NUM;
556     case ISD::SETUO:
557       return CmpMode::NotANumber;
558     case ISD::SETUEQ:
559       return CmpMode::EQU;
560     case ISD::SETUGT:
561       return CmpMode::GTU;
562     case ISD::SETUGE:
563       return CmpMode::GEU;
564     case ISD::SETULT:
565       return CmpMode::LTU;
566     case ISD::SETULE:
567       return CmpMode::LEU;
568     case ISD::SETUNE:
569       return CmpMode::NEU;
570     case ISD::SETEQ:
571       return CmpMode::EQ;
572     case ISD::SETGT:
573       return CmpMode::GT;
574     case ISD::SETGE:
575       return CmpMode::GE;
576     case ISD::SETLT:
577       return CmpMode::LT;
578     case ISD::SETLE:
579       return CmpMode::LE;
580     case ISD::SETNE:
581       return CmpMode::NE;
582     }
583   }(CondCode.get());
584 
585   if (FTZ)
586     PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
587 
588   return PTXCmpMode;
589 }
590 
// Select a SETP_F16X2 node into the SETP_f16x2rr machine instruction, which
// produces two i1 predicate results (one per f16 lane). The comparison mode
// operand encodes the condition code plus the function's FTZ setting.
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}
601 
// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about f16x2 as it's the only real vector type we
  // need to deal with.
  if (Vector.getSimpleValueType() != MVT::v2f16)
    return false;

  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (const auto &U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    // A use may consume Vector through a different operand; only uses where
    // Vector is the source vector (operand 0) count.
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  unsigned Op = NVPTX::SplitF16x2;
  // If the vector has been BITCAST'ed from i32, we can use original
  // value directly and avoid register-to-register move.
  SDValue Source = Vector;
  if (Vector->getOpcode() == ISD::BITCAST) {
    Op = NVPTX::SplitI32toF16x2;
    Source = Vector->getOperand(0);
  }
  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  SDNode *ScatterOp =
      CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
  // Rewire every element-0 extract to result 0 and every element-1 extract
  // to result 1 of the new split node.
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}
654 
getCodeAddrSpace(MemSDNode * N)655 static unsigned int getCodeAddrSpace(MemSDNode *N) {
656   const Value *Src = N->getMemOperand()->getValue();
657 
658   if (!Src)
659     return NVPTX::PTXLdStInstCode::GENERIC;
660 
661   if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
662     switch (PT->getAddressSpace()) {
663     case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
664     case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
665     case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
666     case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
667     case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
668     case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
669     default: break;
670     }
671   }
672   return NVPTX::PTXLdStInstCode::GENERIC;
673 }
674 
// Returns true if the memory access N may be lowered to ld.global.nc (ldg),
// i.e. it is an invariant load from the global address space.
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<Value *, 8> Objs;
  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
                       Objs, F->getDataLayout());

  // The load is invariant only if *every* underlying object is provably
  // read-only for the duration of the kernel.
  return all_of(Objs, [&](Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}
717 
tryIntrinsicNoChain(SDNode * N)718 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
719   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
720   switch (IID) {
721   default:
722     return false;
723   case Intrinsic::nvvm_texsurf_handle_internal:
724     SelectTexSurfHandle(N);
725     return true;
726   }
727 }
728 
// Lower nvvm_texsurf_handle_internal: peel the wrapper off the global symbol
// operand and emit the texsurf_handles instruction producing an i64 handle.
void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}
736 
// Lower an addrspacecast to the matching PTX cvta (to-generic) or cvta.to
// (from-generic) instruction. Only casts with generic on one side are legal;
// anything else is a fatal error. The *_3264 / *_6432 opcode variants handle
// "short pointer" mode, where non-generic spaces use 32-bit pointers on an
// otherwise 64-bit target.
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      // _6432: widen a 32-bit short pointer to a 64-bit generic pointer.
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      // _3264: narrow a 64-bit generic pointer to a 32-bit short pointer.
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                                : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                             : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
808 
// Helper function template to reduce amount of boilerplate code for
// opcode selection.
//
// Given a simple value type, returns the per-type opcode supplied by the
// caller, or None for unsupported types. i1 shares the i8 opcode. The i64 and
// f64 slots are Optional because some instruction families have no 64-bit
// form; callers pass None there to make that type unsupported.
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
    return Opcode_f16;
  case MVT::v2f16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return None;
  }
}
837 
// Select a scalar load into one of the NVPTX LD_* machine nodes, choosing
// the opcode by result type and addressing mode (direct address, symbol +
// immediate, register + immediate, or plain register). Returns false to
// fall back to default selection when the load cannot be handled here
// (indexed load, non-simple memory VT, ordering stronger than monotonic,
// or no opcode variant for the type).
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  // PlainLoad is null for atomic loads; they arrive as generic MemSDNodes.
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  // Loads that qualify for ld.global.nc (LDG) take a separate path.
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Pointer width depends on the address space (short pointers may be 32-bit
  // even on 64-bit targets).
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  // Try the addressing modes from most to least specific: direct address
  // (avar), symbol+immediate (asi), register+immediate (ari), then plain
  // register (areg). The flag operands (volatile, address space, vector
  // arity, from-type, from-type width) are the same in every form.
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                                 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                                 NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                                 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    // Register+immediate needs distinct opcodes for 32- vs 64-bit pointers.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else {
    // Fallback: address taken straight from a register operand.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  // Attach the original memory operand so later passes keep alias/size info.
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, NVPTXLD);
  return true;
}
991 
// Select an NVPTXISD::LoadV2/LoadV4 vector load into an LDV_* machine node,
// choosing the opcode by element type, vector arity, and addressing mode.
// Returns false to fall back to default selection when the node's opcode,
// memory VT, or type has no matching instruction.
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // Loads that qualify for ld.global.nc (LDG) take a separate path.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Pointer width depends on the address space (short pointers may be 32-bit
  // even on 64-bit targets).
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    FromType = NVPTX::PTXLdStInstCode::Unsigned;

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Try the addressing modes from most to least specific: direct address
  // (avar), symbol+immediate (asi), register+immediate (ari), then plain
  // register (areg). Within each mode, pick the v2 or v4 opcode family;
  // v4 has no i64/f64 variants, hence the None entries.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                          NVPTX::LDV_f32_v4_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                          NVPTX::LDV_f32_v4_asi, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    // Register+immediate needs distinct opcodes for 32- vs 64-bit pointers.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                            NVPTX::LDV_f32_v4_ari, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else {
    // Fallback: address taken straight from a register operand.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  }

  // Attach the original memory operand so later passes keep alias/size info.
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, LD);
  return true;
}
1231 
tryLDGLDU(SDNode * N)1232 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1233 
1234   SDValue Chain = N->getOperand(0);
1235   SDValue Op1;
1236   MemSDNode *Mem;
1237   bool IsLDG = true;
1238 
1239   // If this is an LDG intrinsic, the address is the third operand. If its an
1240   // LDG/LDU SD node (from custom vector handling), then its the second operand
1241   if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1242     Op1 = N->getOperand(2);
1243     Mem = cast<MemIntrinsicSDNode>(N);
1244     unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1245     switch (IID) {
1246     default:
1247       return false;
1248     case Intrinsic::nvvm_ldg_global_f:
1249     case Intrinsic::nvvm_ldg_global_i:
1250     case Intrinsic::nvvm_ldg_global_p:
1251       IsLDG = true;
1252       break;
1253     case Intrinsic::nvvm_ldu_global_f:
1254     case Intrinsic::nvvm_ldu_global_i:
1255     case Intrinsic::nvvm_ldu_global_p:
1256       IsLDG = false;
1257       break;
1258     }
1259   } else {
1260     Op1 = N->getOperand(1);
1261     Mem = cast<MemSDNode>(N);
1262   }
1263 
1264   Optional<unsigned> Opcode;
1265   SDLoc DL(N);
1266   SDNode *LD;
1267   SDValue Base, Offset, Addr;
1268 
1269   EVT EltVT = Mem->getMemoryVT();
1270   unsigned NumElts = 1;
1271   if (EltVT.isVector()) {
1272     NumElts = EltVT.getVectorNumElements();
1273     EltVT = EltVT.getVectorElementType();
1274     // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1275     if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1276       assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1277       EltVT = MVT::v2f16;
1278       NumElts /= 2;
1279     }
1280   }
1281 
1282   // Build the "promoted" result VTList for the load. If we are really loading
1283   // i8s, then the return type will be promoted to i16 since we do not expose
1284   // 8-bit registers in NVPTX.
1285   EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1286   SmallVector<EVT, 5> InstVTs;
1287   for (unsigned i = 0; i != NumElts; ++i) {
1288     InstVTs.push_back(NodeVT);
1289   }
1290   InstVTs.push_back(MVT::Other);
1291   SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1292 
1293   if (SelectDirectAddr(Op1, Addr)) {
1294     switch (N->getOpcode()) {
1295     default:
1296       return false;
1297     case ISD::LOAD:
1298     case ISD::INTRINSIC_W_CHAIN:
1299       if (IsLDG)
1300         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1301                                      NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1302                                      NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1303                                      NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1304                                      NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1305                                      NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1306                                      NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1307                                      NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1308                                      NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1309       else
1310         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1311                                      NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1312                                      NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1313                                      NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1314                                      NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1315                                      NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1316                                      NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1317                                      NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1318                                      NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1319       break;
1320     case NVPTXISD::LoadV2:
1321     case NVPTXISD::LDGV2:
1322       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1323                                    NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1324                                    NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1325                                    NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1326                                    NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1327                                    NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1328                                    NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1329                                    NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1330                                    NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1331       break;
1332     case NVPTXISD::LDUV2:
1333       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1334                                    NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1335                                    NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1336                                    NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1337                                    NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1338                                    NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1339                                    NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1340                                    NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1341                                    NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1342       break;
1343     case NVPTXISD::LoadV4:
1344     case NVPTXISD::LDGV4:
1345       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1346                                NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1347                                NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1348                                NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1349                                NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1350                                NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1351                                NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1352       break;
1353     case NVPTXISD::LDUV4:
1354       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1355                                NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1356                                NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1357                                NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1358                                NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1359                                NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1360                                NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1361       break;
1362     }
1363     if (!Opcode)
1364       return false;
1365     SDValue Ops[] = { Addr, Chain };
1366     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1367   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1368                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1369     if (TM.is64Bit()) {
1370       switch (N->getOpcode()) {
1371       default:
1372         return false;
1373       case ISD::LOAD:
1374       case ISD::INTRINSIC_W_CHAIN:
1375         if (IsLDG)
1376           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1377                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1378                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1379                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1380                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1381                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1382                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1383                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1384                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1385         else
1386           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1387                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1388                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1389                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1390                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1391                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1392                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1393                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1394                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1395         break;
1396       case NVPTXISD::LoadV2:
1397       case NVPTXISD::LDGV2:
1398         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1399                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1400                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1401                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1402                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1403                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1404                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1405                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1406                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1407         break;
1408       case NVPTXISD::LDUV2:
1409         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1410                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1411                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1412                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1413                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1414                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1415                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1416                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1417                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1418         break;
1419       case NVPTXISD::LoadV4:
1420       case NVPTXISD::LDGV4:
1421         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1422                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1423                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1424                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1425                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1426                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1427                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1428         break;
1429       case NVPTXISD::LDUV4:
1430         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1431                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1432                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1433                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1434                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1435                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1436                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1437         break;
1438       }
1439     } else {
1440       switch (N->getOpcode()) {
1441       default:
1442         return false;
1443       case ISD::LOAD:
1444       case ISD::INTRINSIC_W_CHAIN:
1445         if (IsLDG)
1446           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1447                                        NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1448                                        NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1449                                        NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1450                                        NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1451                                        NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1452                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1453                                        NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1454                                        NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1455         else
1456           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1457                                        NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1458                                        NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1459                                        NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1460                                        NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1461                                        NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1462                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1463                                        NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1464                                        NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1465         break;
1466       case NVPTXISD::LoadV2:
1467       case NVPTXISD::LDGV2:
1468         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1469                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1470                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1471                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1472                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1473                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1474                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1475                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1476                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1477         break;
1478       case NVPTXISD::LDUV2:
1479         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1480                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1481                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1482                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1483                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1484                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1485                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1486                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1487                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1488         break;
1489       case NVPTXISD::LoadV4:
1490       case NVPTXISD::LDGV4:
1491         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1492                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1493                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1494                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1495                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1496                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1497                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1498         break;
1499       case NVPTXISD::LDUV4:
1500         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1501                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1502                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1503                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1504                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1505                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1506                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1507         break;
1508       }
1509     }
1510     if (!Opcode)
1511       return false;
1512     SDValue Ops[] = {Base, Offset, Chain};
1513     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1514   } else {
1515     if (TM.is64Bit()) {
1516       switch (N->getOpcode()) {
1517       default:
1518         return false;
1519       case ISD::LOAD:
1520       case ISD::INTRINSIC_W_CHAIN:
1521         if (IsLDG)
1522           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1523                                        NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1524                                        NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1525                                        NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1526                                        NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1527                                        NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1528                                        NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1529                                        NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1530                                        NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1531         else
1532           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1533                                        NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1534                                        NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1535                                        NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1536                                        NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1537                                        NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1538                                        NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1539                                        NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1540                                        NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1541         break;
1542       case NVPTXISD::LoadV2:
1543       case NVPTXISD::LDGV2:
1544         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1545                                      NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1546                                      NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1547                                      NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1548                                      NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1549                                      NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1550                                      NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1551                                      NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1552                                      NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1553         break;
1554       case NVPTXISD::LDUV2:
1555         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1556                                      NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1557                                      NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1558                                      NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1559                                      NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1560                                      NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1561                                      NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1562                                      NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1563                                      NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1564         break;
1565       case NVPTXISD::LoadV4:
1566       case NVPTXISD::LDGV4:
1567         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1568                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1569                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1570                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1571                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1572                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1573                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1574         break;
1575       case NVPTXISD::LDUV4:
1576         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1577                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1578                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1579                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1580                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1581                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1582                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1583         break;
1584       }
1585     } else {
1586       switch (N->getOpcode()) {
1587       default:
1588         return false;
1589       case ISD::LOAD:
1590       case ISD::INTRINSIC_W_CHAIN:
1591         if (IsLDG)
1592           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1593                                    NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1594                                    NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1595                                    NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1596                                    NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1597                                    NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1598                                    NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1599                                    NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1600                                    NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1601         else
1602           Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1603                                    NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1604                                    NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1605                                    NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1606                                    NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1607                                    NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1608                                    NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1609                                    NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1610                                    NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1611         break;
1612       case NVPTXISD::LoadV2:
1613       case NVPTXISD::LDGV2:
1614         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1615                                  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1616                                  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1617                                  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1618                                  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1619                                  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1620                                  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1621                                  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1622                                  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1623         break;
1624       case NVPTXISD::LDUV2:
1625         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1626                                  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1627                                  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1628                                  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1629                                  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1630                                  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1631                                  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1632                                  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1633                                  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1634         break;
1635       case NVPTXISD::LoadV4:
1636       case NVPTXISD::LDGV4:
1637         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1638                                  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1639                                  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1640                                  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1641                                  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1642                                  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1643                                  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1644         break;
1645       case NVPTXISD::LDUV4:
1646         Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1647                                  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1648                                  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1649                                  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1650                                  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1651                                  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1652                                  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1653         break;
1654       }
1655     }
1656     if (!Opcode)
1657       return false;
1658     SDValue Ops[] = { Op1, Chain };
1659     LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1660   }
1661 
1662   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
1663   MemRefs0[0] = Mem->getMemOperand();
1664   cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
1665 
1666   // For automatic generation of LDG (through SelectLoad[Vector], not the
1667   // intrinsics), we may have an extending load like:
1668   //
1669   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1670   //
1671   // In this case, the matching logic above will select a load for the original
1672   // memory type (in this case, i8) and our types will not match (the node needs
1673   // to return an i32 in this case). Our LDG/LDU nodes do not support the
1674   // concept of sign-/zero-extension, so emulate it here by adding an explicit
1675   // CVT instruction. Ptxas should clean up any redundancies here.
1676 
1677   EVT OrigType = N->getValueType(0);
1678   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1679 
1680   if (OrigType != EltVT && LdNode) {
1681     // We have an extending-load. The instruction we selected operates on the
1682     // smaller type, but the SDNode we are replacing has the larger type. We
1683     // need to emit a CVT to make the types match.
1684     bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1685     unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1686                                        EltVT.getSimpleVT(), IsSigned);
1687 
1688     // For each output value, apply the manual sign/zero-extension and make sure
1689     // all users of the load go through that CVT.
1690     for (unsigned i = 0; i != NumElts; ++i) {
1691       SDValue Res(LD, i);
1692       SDValue OrigVal(N, i);
1693 
1694       SDNode *CvtNode =
1695         CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1696                                CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1697                                                          DL, MVT::i32));
1698       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1699     }
1700   }
1701 
1702   ReplaceNode(N, LD);
1703   return true;
1704 }
1705 
// Select a PTX "st" instruction for a scalar (or v2f16) store node N, which
// may be a plain StoreSDNode or an atomic store. On success the selected
// machine node replaces N and true is returned; returning false lets the
// default (tablegen) selection handle the node instead.
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  // Exactly one of these two casts succeeds, depending on the store flavor.
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  // A monotonic atomic store is therefore emitted as a volatile store.
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    toType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  // Try the PTX addressing modes in order of preference: direct address
  // (avar), symbol+immediate (asi), register+immediate (ari), and finally
  // plain register (areg). Each mode picks a per-type opcode variant; a
  // pickOpcodeForVT miss (no opcode for this value type) bails out.
  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Register+immediate: 64-bit pointers need the _64 opcode variants.
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  } else {
    // Fallback: address is taken directly from a register.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  // Transfer the original memory operand so alias analysis and scheduling
  // still see the underlying memory access.
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
  ReplaceNode(N, NVPTXST);
  return true;
}
1873 
// Select a PTX vector store (st.v2 / st.v4) for a NVPTXISD::StoreV2 or
// NVPTXISD::StoreV4 node. On success the selected machine node replaces N and
// true is returned; returning false lets other selection logic try the node.
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    ToType = NVPTX::PTXLdStInstCode::Unsigned;

  // Collect the values being stored into StOps; N2 is the address operand
  // that follows them in the node's operand list.
  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  // Try the PTX addressing modes in order of preference: direct address
  // (avar), symbol+immediate (asi), register+immediate (ari), plain register
  // (areg). 64-bit pointers use the _64/ari_64/areg_64 opcode variants.
  // Note v4 forms have no i64/f64 variants (None): a v4 of 64-bit elements
  // would exceed the 128-bit vector access width.
  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
                          NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
                          NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
                          NVPTX::STV_f32_v4_avar, None);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
                          NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
                          NVPTX::STV_f32_v4_asi, None);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
            NVPTX::STV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
                            NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
                            NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
                            NVPTX::STV_f32_v4_ari, None);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
            NVPTX::STV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
                            NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
                            NVPTX::STV_f32_v4_areg, None);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);

  // Transfer the original memory operand so later passes still see the
  // underlying memory access.
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, ST);
  return true;
}
2098 
tryLoadParam(SDNode * Node)2099 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2100   SDValue Chain = Node->getOperand(0);
2101   SDValue Offset = Node->getOperand(2);
2102   SDValue Flag = Node->getOperand(3);
2103   SDLoc DL(Node);
2104   MemSDNode *Mem = cast<MemSDNode>(Node);
2105 
2106   unsigned VecSize;
2107   switch (Node->getOpcode()) {
2108   default:
2109     return false;
2110   case NVPTXISD::LoadParam:
2111     VecSize = 1;
2112     break;
2113   case NVPTXISD::LoadParamV2:
2114     VecSize = 2;
2115     break;
2116   case NVPTXISD::LoadParamV4:
2117     VecSize = 4;
2118     break;
2119   }
2120 
2121   EVT EltVT = Node->getValueType(0);
2122   EVT MemVT = Mem->getMemoryVT();
2123 
2124   Optional<unsigned> Opcode;
2125 
2126   switch (VecSize) {
2127   default:
2128     return false;
2129   case 1:
2130     Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2131                              NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2132                              NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2133                              NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2134                              NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2135     break;
2136   case 2:
2137     Opcode =
2138         pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2139                         NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2140                         NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2141                         NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2142                         NVPTX::LoadParamMemV2F64);
2143     break;
2144   case 4:
2145     Opcode = pickOpcodeForVT(
2146         MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2147         NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2148         NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2149         NVPTX::LoadParamMemV4F32, None);
2150     break;
2151   }
2152   if (!Opcode)
2153     return false;
2154 
2155   SDVTList VTs;
2156   if (VecSize == 1) {
2157     VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2158   } else if (VecSize == 2) {
2159     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2160   } else {
2161     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2162     VTs = CurDAG->getVTList(EVTs);
2163   }
2164 
2165   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2166 
2167   SmallVector<SDValue, 2> Ops;
2168   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2169   Ops.push_back(Chain);
2170   Ops.push_back(Flag);
2171 
2172   ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
2173   return true;
2174 }
2175 
tryStoreRetval(SDNode * N)2176 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2177   SDLoc DL(N);
2178   SDValue Chain = N->getOperand(0);
2179   SDValue Offset = N->getOperand(1);
2180   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2181   MemSDNode *Mem = cast<MemSDNode>(N);
2182 
2183   // How many elements do we have?
2184   unsigned NumElts = 1;
2185   switch (N->getOpcode()) {
2186   default:
2187     return false;
2188   case NVPTXISD::StoreRetval:
2189     NumElts = 1;
2190     break;
2191   case NVPTXISD::StoreRetvalV2:
2192     NumElts = 2;
2193     break;
2194   case NVPTXISD::StoreRetvalV4:
2195     NumElts = 4;
2196     break;
2197   }
2198 
2199   // Build vector of operands
2200   SmallVector<SDValue, 6> Ops;
2201   for (unsigned i = 0; i < NumElts; ++i)
2202     Ops.push_back(N->getOperand(i + 2));
2203   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2204   Ops.push_back(Chain);
2205 
2206   // Determine target opcode
2207   // If we have an i1, use an 8-bit store. The lowering code in
2208   // NVPTXISelLowering will have already emitted an upcast.
2209   Optional<unsigned> Opcode = 0;
2210   switch (NumElts) {
2211   default:
2212     return false;
2213   case 1:
2214     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2215                              NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2216                              NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2217                              NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2218                              NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2219     break;
2220   case 2:
2221     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2222                              NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2223                              NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2224                              NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2225                              NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2226     break;
2227   case 4:
2228     Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2229                              NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2230                              NVPTX::StoreRetvalV4I32, None,
2231                              NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2232                              NVPTX::StoreRetvalV4F32, None);
2233     break;
2234   }
2235   if (!Opcode)
2236     return false;
2237 
2238   SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
2239   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
2240   MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
2241   cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
2242 
2243   ReplaceNode(N, Ret);
2244   return true;
2245 }
2246 
tryStoreParam(SDNode * N)2247 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2248   SDLoc DL(N);
2249   SDValue Chain = N->getOperand(0);
2250   SDValue Param = N->getOperand(1);
2251   unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2252   SDValue Offset = N->getOperand(2);
2253   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2254   MemSDNode *Mem = cast<MemSDNode>(N);
2255   SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2256 
2257   // How many elements do we have?
2258   unsigned NumElts = 1;
2259   switch (N->getOpcode()) {
2260   default:
2261     return false;
2262   case NVPTXISD::StoreParamU32:
2263   case NVPTXISD::StoreParamS32:
2264   case NVPTXISD::StoreParam:
2265     NumElts = 1;
2266     break;
2267   case NVPTXISD::StoreParamV2:
2268     NumElts = 2;
2269     break;
2270   case NVPTXISD::StoreParamV4:
2271     NumElts = 4;
2272     break;
2273   }
2274 
2275   // Build vector of operands
2276   SmallVector<SDValue, 8> Ops;
2277   for (unsigned i = 0; i < NumElts; ++i)
2278     Ops.push_back(N->getOperand(i + 3));
2279   Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2280   Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2281   Ops.push_back(Chain);
2282   Ops.push_back(Flag);
2283 
2284   // Determine target opcode
2285   // If we have an i1, use an 8-bit store. The lowering code in
2286   // NVPTXISelLowering will have already emitted an upcast.
2287   Optional<unsigned> Opcode = 0;
2288   switch (N->getOpcode()) {
2289   default:
2290     switch (NumElts) {
2291     default:
2292       return false;
2293     case 1:
2294       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2295                                NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2296                                NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2297                                NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2298                                NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2299       break;
2300     case 2:
2301       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2302                                NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2303                                NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2304                                NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2305                                NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2306       break;
2307     case 4:
2308       Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2309                                NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2310                                NVPTX::StoreParamV4I32, None,
2311                                NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2312                                NVPTX::StoreParamV4F32, None);
2313       break;
2314     }
2315     if (!Opcode)
2316       return false;
2317     break;
2318   // Special case: if we have a sign-extend/zero-extend node, insert the
2319   // conversion instruction first, and use that as the value operand to
2320   // the selected StoreParam node.
2321   case NVPTXISD::StoreParamU32: {
2322     Opcode = NVPTX::StoreParamI32;
2323     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2324                                                 MVT::i32);
2325     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2326                                          MVT::i32, Ops[0], CvtNone);
2327     Ops[0] = SDValue(Cvt, 0);
2328     break;
2329   }
2330   case NVPTXISD::StoreParamS32: {
2331     Opcode = NVPTX::StoreParamI32;
2332     SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2333                                                 MVT::i32);
2334     SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2335                                          MVT::i32, Ops[0], CvtNone);
2336     Ops[0] = SDValue(Cvt, 0);
2337     break;
2338   }
2339   }
2340 
2341   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2342   SDNode *Ret =
2343       CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
2344   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
2345   MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
2346   cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
2347 
2348   ReplaceNode(N, Ret);
2349   return true;
2350 }
2351 
/// Try to select one of the NVPTXISD texture / tld4 nodes into the matching
/// TEX_* / TLD4_* machine instruction. This is a pure 1:1 opcode mapping: the
/// node's operands (minus the chain) are forwarded unchanged, with the chain
/// moved to the back of the operand list. Returns false for any other opcode.
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  // --- Independent-sampler texture fetches: 1D ---
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD;
    break;
  // --- 1D array ---
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
    break;
  // --- 2D ---
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD;
    break;
  // --- 2D array ---
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
    break;
  // --- 3D ---
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD;
    break;
  // --- Cube / cube array (no S32-coordinate or Grad forms) ---
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
    break;
  // --- tld4 (fetch one component from 4 texels): R/G/B/A channels ---
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32;
    break;
  // --- Unified-mode (texture handle carries the sampler) variants ---
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
    break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
2870 
trySurfaceIntrinsic(SDNode * N)2871 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2872   unsigned Opc = 0;
2873   switch (N->getOpcode()) {
2874   default: return false;
2875   case NVPTXISD::Suld1DI8Clamp:
2876     Opc = NVPTX::SULD_1D_I8_CLAMP;
2877     break;
2878   case NVPTXISD::Suld1DI16Clamp:
2879     Opc = NVPTX::SULD_1D_I16_CLAMP;
2880     break;
2881   case NVPTXISD::Suld1DI32Clamp:
2882     Opc = NVPTX::SULD_1D_I32_CLAMP;
2883     break;
2884   case NVPTXISD::Suld1DI64Clamp:
2885     Opc = NVPTX::SULD_1D_I64_CLAMP;
2886     break;
2887   case NVPTXISD::Suld1DV2I8Clamp:
2888     Opc = NVPTX::SULD_1D_V2I8_CLAMP;
2889     break;
2890   case NVPTXISD::Suld1DV2I16Clamp:
2891     Opc = NVPTX::SULD_1D_V2I16_CLAMP;
2892     break;
2893   case NVPTXISD::Suld1DV2I32Clamp:
2894     Opc = NVPTX::SULD_1D_V2I32_CLAMP;
2895     break;
2896   case NVPTXISD::Suld1DV2I64Clamp:
2897     Opc = NVPTX::SULD_1D_V2I64_CLAMP;
2898     break;
2899   case NVPTXISD::Suld1DV4I8Clamp:
2900     Opc = NVPTX::SULD_1D_V4I8_CLAMP;
2901     break;
2902   case NVPTXISD::Suld1DV4I16Clamp:
2903     Opc = NVPTX::SULD_1D_V4I16_CLAMP;
2904     break;
2905   case NVPTXISD::Suld1DV4I32Clamp:
2906     Opc = NVPTX::SULD_1D_V4I32_CLAMP;
2907     break;
2908   case NVPTXISD::Suld1DArrayI8Clamp:
2909     Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
2910     break;
2911   case NVPTXISD::Suld1DArrayI16Clamp:
2912     Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
2913     break;
2914   case NVPTXISD::Suld1DArrayI32Clamp:
2915     Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
2916     break;
2917   case NVPTXISD::Suld1DArrayI64Clamp:
2918     Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
2919     break;
2920   case NVPTXISD::Suld1DArrayV2I8Clamp:
2921     Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
2922     break;
2923   case NVPTXISD::Suld1DArrayV2I16Clamp:
2924     Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
2925     break;
2926   case NVPTXISD::Suld1DArrayV2I32Clamp:
2927     Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
2928     break;
2929   case NVPTXISD::Suld1DArrayV2I64Clamp:
2930     Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
2931     break;
2932   case NVPTXISD::Suld1DArrayV4I8Clamp:
2933     Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
2934     break;
2935   case NVPTXISD::Suld1DArrayV4I16Clamp:
2936     Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
2937     break;
2938   case NVPTXISD::Suld1DArrayV4I32Clamp:
2939     Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
2940     break;
2941   case NVPTXISD::Suld2DI8Clamp:
2942     Opc = NVPTX::SULD_2D_I8_CLAMP;
2943     break;
2944   case NVPTXISD::Suld2DI16Clamp:
2945     Opc = NVPTX::SULD_2D_I16_CLAMP;
2946     break;
2947   case NVPTXISD::Suld2DI32Clamp:
2948     Opc = NVPTX::SULD_2D_I32_CLAMP;
2949     break;
2950   case NVPTXISD::Suld2DI64Clamp:
2951     Opc = NVPTX::SULD_2D_I64_CLAMP;
2952     break;
2953   case NVPTXISD::Suld2DV2I8Clamp:
2954     Opc = NVPTX::SULD_2D_V2I8_CLAMP;
2955     break;
2956   case NVPTXISD::Suld2DV2I16Clamp:
2957     Opc = NVPTX::SULD_2D_V2I16_CLAMP;
2958     break;
2959   case NVPTXISD::Suld2DV2I32Clamp:
2960     Opc = NVPTX::SULD_2D_V2I32_CLAMP;
2961     break;
2962   case NVPTXISD::Suld2DV2I64Clamp:
2963     Opc = NVPTX::SULD_2D_V2I64_CLAMP;
2964     break;
2965   case NVPTXISD::Suld2DV4I8Clamp:
2966     Opc = NVPTX::SULD_2D_V4I8_CLAMP;
2967     break;
2968   case NVPTXISD::Suld2DV4I16Clamp:
2969     Opc = NVPTX::SULD_2D_V4I16_CLAMP;
2970     break;
2971   case NVPTXISD::Suld2DV4I32Clamp:
2972     Opc = NVPTX::SULD_2D_V4I32_CLAMP;
2973     break;
2974   case NVPTXISD::Suld2DArrayI8Clamp:
2975     Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
2976     break;
2977   case NVPTXISD::Suld2DArrayI16Clamp:
2978     Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
2979     break;
2980   case NVPTXISD::Suld2DArrayI32Clamp:
2981     Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
2982     break;
2983   case NVPTXISD::Suld2DArrayI64Clamp:
2984     Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
2985     break;
2986   case NVPTXISD::Suld2DArrayV2I8Clamp:
2987     Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
2988     break;
2989   case NVPTXISD::Suld2DArrayV2I16Clamp:
2990     Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
2991     break;
2992   case NVPTXISD::Suld2DArrayV2I32Clamp:
2993     Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
2994     break;
2995   case NVPTXISD::Suld2DArrayV2I64Clamp:
2996     Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
2997     break;
2998   case NVPTXISD::Suld2DArrayV4I8Clamp:
2999     Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
3000     break;
3001   case NVPTXISD::Suld2DArrayV4I16Clamp:
3002     Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
3003     break;
3004   case NVPTXISD::Suld2DArrayV4I32Clamp:
3005     Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
3006     break;
3007   case NVPTXISD::Suld3DI8Clamp:
3008     Opc = NVPTX::SULD_3D_I8_CLAMP;
3009     break;
3010   case NVPTXISD::Suld3DI16Clamp:
3011     Opc = NVPTX::SULD_3D_I16_CLAMP;
3012     break;
3013   case NVPTXISD::Suld3DI32Clamp:
3014     Opc = NVPTX::SULD_3D_I32_CLAMP;
3015     break;
3016   case NVPTXISD::Suld3DI64Clamp:
3017     Opc = NVPTX::SULD_3D_I64_CLAMP;
3018     break;
3019   case NVPTXISD::Suld3DV2I8Clamp:
3020     Opc = NVPTX::SULD_3D_V2I8_CLAMP;
3021     break;
3022   case NVPTXISD::Suld3DV2I16Clamp:
3023     Opc = NVPTX::SULD_3D_V2I16_CLAMP;
3024     break;
3025   case NVPTXISD::Suld3DV2I32Clamp:
3026     Opc = NVPTX::SULD_3D_V2I32_CLAMP;
3027     break;
3028   case NVPTXISD::Suld3DV2I64Clamp:
3029     Opc = NVPTX::SULD_3D_V2I64_CLAMP;
3030     break;
3031   case NVPTXISD::Suld3DV4I8Clamp:
3032     Opc = NVPTX::SULD_3D_V4I8_CLAMP;
3033     break;
3034   case NVPTXISD::Suld3DV4I16Clamp:
3035     Opc = NVPTX::SULD_3D_V4I16_CLAMP;
3036     break;
3037   case NVPTXISD::Suld3DV4I32Clamp:
3038     Opc = NVPTX::SULD_3D_V4I32_CLAMP;
3039     break;
3040   case NVPTXISD::Suld1DI8Trap:
3041     Opc = NVPTX::SULD_1D_I8_TRAP;
3042     break;
3043   case NVPTXISD::Suld1DI16Trap:
3044     Opc = NVPTX::SULD_1D_I16_TRAP;
3045     break;
3046   case NVPTXISD::Suld1DI32Trap:
3047     Opc = NVPTX::SULD_1D_I32_TRAP;
3048     break;
3049   case NVPTXISD::Suld1DI64Trap:
3050     Opc = NVPTX::SULD_1D_I64_TRAP;
3051     break;
3052   case NVPTXISD::Suld1DV2I8Trap:
3053     Opc = NVPTX::SULD_1D_V2I8_TRAP;
3054     break;
3055   case NVPTXISD::Suld1DV2I16Trap:
3056     Opc = NVPTX::SULD_1D_V2I16_TRAP;
3057     break;
3058   case NVPTXISD::Suld1DV2I32Trap:
3059     Opc = NVPTX::SULD_1D_V2I32_TRAP;
3060     break;
3061   case NVPTXISD::Suld1DV2I64Trap:
3062     Opc = NVPTX::SULD_1D_V2I64_TRAP;
3063     break;
3064   case NVPTXISD::Suld1DV4I8Trap:
3065     Opc = NVPTX::SULD_1D_V4I8_TRAP;
3066     break;
3067   case NVPTXISD::Suld1DV4I16Trap:
3068     Opc = NVPTX::SULD_1D_V4I16_TRAP;
3069     break;
3070   case NVPTXISD::Suld1DV4I32Trap:
3071     Opc = NVPTX::SULD_1D_V4I32_TRAP;
3072     break;
3073   case NVPTXISD::Suld1DArrayI8Trap:
3074     Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
3075     break;
3076   case NVPTXISD::Suld1DArrayI16Trap:
3077     Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
3078     break;
3079   case NVPTXISD::Suld1DArrayI32Trap:
3080     Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
3081     break;
3082   case NVPTXISD::Suld1DArrayI64Trap:
3083     Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
3084     break;
3085   case NVPTXISD::Suld1DArrayV2I8Trap:
3086     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
3087     break;
3088   case NVPTXISD::Suld1DArrayV2I16Trap:
3089     Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
3090     break;
3091   case NVPTXISD::Suld1DArrayV2I32Trap:
3092     Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
3093     break;
3094   case NVPTXISD::Suld1DArrayV2I64Trap:
3095     Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
3096     break;
3097   case NVPTXISD::Suld1DArrayV4I8Trap:
3098     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
3099     break;
3100   case NVPTXISD::Suld1DArrayV4I16Trap:
3101     Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
3102     break;
3103   case NVPTXISD::Suld1DArrayV4I32Trap:
3104     Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
3105     break;
3106   case NVPTXISD::Suld2DI8Trap:
3107     Opc = NVPTX::SULD_2D_I8_TRAP;
3108     break;
3109   case NVPTXISD::Suld2DI16Trap:
3110     Opc = NVPTX::SULD_2D_I16_TRAP;
3111     break;
3112   case NVPTXISD::Suld2DI32Trap:
3113     Opc = NVPTX::SULD_2D_I32_TRAP;
3114     break;
3115   case NVPTXISD::Suld2DI64Trap:
3116     Opc = NVPTX::SULD_2D_I64_TRAP;
3117     break;
3118   case NVPTXISD::Suld2DV2I8Trap:
3119     Opc = NVPTX::SULD_2D_V2I8_TRAP;
3120     break;
3121   case NVPTXISD::Suld2DV2I16Trap:
3122     Opc = NVPTX::SULD_2D_V2I16_TRAP;
3123     break;
3124   case NVPTXISD::Suld2DV2I32Trap:
3125     Opc = NVPTX::SULD_2D_V2I32_TRAP;
3126     break;
3127   case NVPTXISD::Suld2DV2I64Trap:
3128     Opc = NVPTX::SULD_2D_V2I64_TRAP;
3129     break;
3130   case NVPTXISD::Suld2DV4I8Trap:
3131     Opc = NVPTX::SULD_2D_V4I8_TRAP;
3132     break;
3133   case NVPTXISD::Suld2DV4I16Trap:
3134     Opc = NVPTX::SULD_2D_V4I16_TRAP;
3135     break;
3136   case NVPTXISD::Suld2DV4I32Trap:
3137     Opc = NVPTX::SULD_2D_V4I32_TRAP;
3138     break;
3139   case NVPTXISD::Suld2DArrayI8Trap:
3140     Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
3141     break;
3142   case NVPTXISD::Suld2DArrayI16Trap:
3143     Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
3144     break;
3145   case NVPTXISD::Suld2DArrayI32Trap:
3146     Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
3147     break;
3148   case NVPTXISD::Suld2DArrayI64Trap:
3149     Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
3150     break;
3151   case NVPTXISD::Suld2DArrayV2I8Trap:
3152     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
3153     break;
3154   case NVPTXISD::Suld2DArrayV2I16Trap:
3155     Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
3156     break;
3157   case NVPTXISD::Suld2DArrayV2I32Trap:
3158     Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
3159     break;
3160   case NVPTXISD::Suld2DArrayV2I64Trap:
3161     Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
3162     break;
3163   case NVPTXISD::Suld2DArrayV4I8Trap:
3164     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
3165     break;
3166   case NVPTXISD::Suld2DArrayV4I16Trap:
3167     Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
3168     break;
3169   case NVPTXISD::Suld2DArrayV4I32Trap:
3170     Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
3171     break;
3172   case NVPTXISD::Suld3DI8Trap:
3173     Opc = NVPTX::SULD_3D_I8_TRAP;
3174     break;
3175   case NVPTXISD::Suld3DI16Trap:
3176     Opc = NVPTX::SULD_3D_I16_TRAP;
3177     break;
3178   case NVPTXISD::Suld3DI32Trap:
3179     Opc = NVPTX::SULD_3D_I32_TRAP;
3180     break;
3181   case NVPTXISD::Suld3DI64Trap:
3182     Opc = NVPTX::SULD_3D_I64_TRAP;
3183     break;
3184   case NVPTXISD::Suld3DV2I8Trap:
3185     Opc = NVPTX::SULD_3D_V2I8_TRAP;
3186     break;
3187   case NVPTXISD::Suld3DV2I16Trap:
3188     Opc = NVPTX::SULD_3D_V2I16_TRAP;
3189     break;
3190   case NVPTXISD::Suld3DV2I32Trap:
3191     Opc = NVPTX::SULD_3D_V2I32_TRAP;
3192     break;
3193   case NVPTXISD::Suld3DV2I64Trap:
3194     Opc = NVPTX::SULD_3D_V2I64_TRAP;
3195     break;
3196   case NVPTXISD::Suld3DV4I8Trap:
3197     Opc = NVPTX::SULD_3D_V4I8_TRAP;
3198     break;
3199   case NVPTXISD::Suld3DV4I16Trap:
3200     Opc = NVPTX::SULD_3D_V4I16_TRAP;
3201     break;
3202   case NVPTXISD::Suld3DV4I32Trap:
3203     Opc = NVPTX::SULD_3D_V4I32_TRAP;
3204     break;
3205   case NVPTXISD::Suld1DI8Zero:
3206     Opc = NVPTX::SULD_1D_I8_ZERO;
3207     break;
3208   case NVPTXISD::Suld1DI16Zero:
3209     Opc = NVPTX::SULD_1D_I16_ZERO;
3210     break;
3211   case NVPTXISD::Suld1DI32Zero:
3212     Opc = NVPTX::SULD_1D_I32_ZERO;
3213     break;
3214   case NVPTXISD::Suld1DI64Zero:
3215     Opc = NVPTX::SULD_1D_I64_ZERO;
3216     break;
3217   case NVPTXISD::Suld1DV2I8Zero:
3218     Opc = NVPTX::SULD_1D_V2I8_ZERO;
3219     break;
3220   case NVPTXISD::Suld1DV2I16Zero:
3221     Opc = NVPTX::SULD_1D_V2I16_ZERO;
3222     break;
3223   case NVPTXISD::Suld1DV2I32Zero:
3224     Opc = NVPTX::SULD_1D_V2I32_ZERO;
3225     break;
3226   case NVPTXISD::Suld1DV2I64Zero:
3227     Opc = NVPTX::SULD_1D_V2I64_ZERO;
3228     break;
3229   case NVPTXISD::Suld1DV4I8Zero:
3230     Opc = NVPTX::SULD_1D_V4I8_ZERO;
3231     break;
3232   case NVPTXISD::Suld1DV4I16Zero:
3233     Opc = NVPTX::SULD_1D_V4I16_ZERO;
3234     break;
3235   case NVPTXISD::Suld1DV4I32Zero:
3236     Opc = NVPTX::SULD_1D_V4I32_ZERO;
3237     break;
3238   case NVPTXISD::Suld1DArrayI8Zero:
3239     Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
3240     break;
3241   case NVPTXISD::Suld1DArrayI16Zero:
3242     Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
3243     break;
3244   case NVPTXISD::Suld1DArrayI32Zero:
3245     Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
3246     break;
3247   case NVPTXISD::Suld1DArrayI64Zero:
3248     Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
3249     break;
3250   case NVPTXISD::Suld1DArrayV2I8Zero:
3251     Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
3252     break;
3253   case NVPTXISD::Suld1DArrayV2I16Zero:
3254     Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
3255     break;
3256   case NVPTXISD::Suld1DArrayV2I32Zero:
3257     Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
3258     break;
3259   case NVPTXISD::Suld1DArrayV2I64Zero:
3260     Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
3261     break;
3262   case NVPTXISD::Suld1DArrayV4I8Zero:
3263     Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
3264     break;
3265   case NVPTXISD::Suld1DArrayV4I16Zero:
3266     Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
3267     break;
3268   case NVPTXISD::Suld1DArrayV4I32Zero:
3269     Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
3270     break;
3271   case NVPTXISD::Suld2DI8Zero:
3272     Opc = NVPTX::SULD_2D_I8_ZERO;
3273     break;
3274   case NVPTXISD::Suld2DI16Zero:
3275     Opc = NVPTX::SULD_2D_I16_ZERO;
3276     break;
3277   case NVPTXISD::Suld2DI32Zero:
3278     Opc = NVPTX::SULD_2D_I32_ZERO;
3279     break;
3280   case NVPTXISD::Suld2DI64Zero:
3281     Opc = NVPTX::SULD_2D_I64_ZERO;
3282     break;
3283   case NVPTXISD::Suld2DV2I8Zero:
3284     Opc = NVPTX::SULD_2D_V2I8_ZERO;
3285     break;
3286   case NVPTXISD::Suld2DV2I16Zero:
3287     Opc = NVPTX::SULD_2D_V2I16_ZERO;
3288     break;
3289   case NVPTXISD::Suld2DV2I32Zero:
3290     Opc = NVPTX::SULD_2D_V2I32_ZERO;
3291     break;
3292   case NVPTXISD::Suld2DV2I64Zero:
3293     Opc = NVPTX::SULD_2D_V2I64_ZERO;
3294     break;
3295   case NVPTXISD::Suld2DV4I8Zero:
3296     Opc = NVPTX::SULD_2D_V4I8_ZERO;
3297     break;
3298   case NVPTXISD::Suld2DV4I16Zero:
3299     Opc = NVPTX::SULD_2D_V4I16_ZERO;
3300     break;
3301   case NVPTXISD::Suld2DV4I32Zero:
3302     Opc = NVPTX::SULD_2D_V4I32_ZERO;
3303     break;
3304   case NVPTXISD::Suld2DArrayI8Zero:
3305     Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
3306     break;
3307   case NVPTXISD::Suld2DArrayI16Zero:
3308     Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
3309     break;
3310   case NVPTXISD::Suld2DArrayI32Zero:
3311     Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
3312     break;
3313   case NVPTXISD::Suld2DArrayI64Zero:
3314     Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
3315     break;
3316   case NVPTXISD::Suld2DArrayV2I8Zero:
3317     Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
3318     break;
3319   case NVPTXISD::Suld2DArrayV2I16Zero:
3320     Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
3321     break;
3322   case NVPTXISD::Suld2DArrayV2I32Zero:
3323     Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
3324     break;
3325   case NVPTXISD::Suld2DArrayV2I64Zero:
3326     Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
3327     break;
3328   case NVPTXISD::Suld2DArrayV4I8Zero:
3329     Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
3330     break;
3331   case NVPTXISD::Suld2DArrayV4I16Zero:
3332     Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
3333     break;
3334   case NVPTXISD::Suld2DArrayV4I32Zero:
3335     Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
3336     break;
3337   case NVPTXISD::Suld3DI8Zero:
3338     Opc = NVPTX::SULD_3D_I8_ZERO;
3339     break;
3340   case NVPTXISD::Suld3DI16Zero:
3341     Opc = NVPTX::SULD_3D_I16_ZERO;
3342     break;
3343   case NVPTXISD::Suld3DI32Zero:
3344     Opc = NVPTX::SULD_3D_I32_ZERO;
3345     break;
3346   case NVPTXISD::Suld3DI64Zero:
3347     Opc = NVPTX::SULD_3D_I64_ZERO;
3348     break;
3349   case NVPTXISD::Suld3DV2I8Zero:
3350     Opc = NVPTX::SULD_3D_V2I8_ZERO;
3351     break;
3352   case NVPTXISD::Suld3DV2I16Zero:
3353     Opc = NVPTX::SULD_3D_V2I16_ZERO;
3354     break;
3355   case NVPTXISD::Suld3DV2I32Zero:
3356     Opc = NVPTX::SULD_3D_V2I32_ZERO;
3357     break;
3358   case NVPTXISD::Suld3DV2I64Zero:
3359     Opc = NVPTX::SULD_3D_V2I64_ZERO;
3360     break;
3361   case NVPTXISD::Suld3DV4I8Zero:
3362     Opc = NVPTX::SULD_3D_V4I8_ZERO;
3363     break;
3364   case NVPTXISD::Suld3DV4I16Zero:
3365     Opc = NVPTX::SULD_3D_V4I16_ZERO;
3366     break;
3367   case NVPTXISD::Suld3DV4I32Zero:
3368     Opc = NVPTX::SULD_3D_V4I32_ZERO;
3369     break;
3370   }
3371 
3372   // Copy over operands
3373   SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3374   Ops.push_back(N->getOperand(0)); // Move chain to the back.
3375 
3376   ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3377   return true;
3378 }
3379 
3380 
/// tryBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
///
/// Recognizes three rooted patterns:
///   (and (srl/sra val, start), mask)   -- mask of low ones selects the length
///   (srl/sra (and val, mask), shift)   -- mask may be plain or shifted
///   (srl/sra (shl val, NN), MM)        -- profitable when MM >= NN
/// On a match, replaces \p N with a BFE machine node and returns true;
/// returns false to fall back to normal instruction selection.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;   // BFE length operand (number of bits to extract).
  SDValue Start; // BFE start-position operand.
  SDValue Val;   // Value the bit-field is extracted from.
  bool IsSigned = false; // Use the signed BFE variant (sign-extends the field).

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    uint64_t NumBits = countTrailingOnes(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left?  "good" is defined here as bits
        // that exist in the original value, not shifted in.
        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher-throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        // NOTE(review): if ShiftAmt exceeds the number of trailing ones this
        // subtraction wraps around in uint64_t -- confirm upstream callers
        // cannot form such a pattern.
        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = countTrailingZeros(MaskVal);
        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer shift
      // then this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have a arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }


  unsigned Opc;
  // Pick the BFE opcode for the value's width. Only the (sra (shl ...))
  // pattern above sets IsSigned; the mask-derived patterns always use the
  // unsigned variants.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {
    Val, Start, Len
  };

  // Replace the matched subtree with a single BFE machine node.
  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}
3586 
3587 // SelectDirectAddr - Match a direct address for DAG.
3588 // A direct address could be a globaladdress or externalsymbol.
SelectDirectAddr(SDValue N,SDValue & Address)3589 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3590   // Return true if TGA or ES.
3591   if (N.getOpcode() == ISD::TargetGlobalAddress ||
3592       N.getOpcode() == ISD::TargetExternalSymbol) {
3593     Address = N;
3594     return true;
3595   }
3596   if (N.getOpcode() == NVPTXISD::Wrapper) {
3597     Address = N.getOperand(0);
3598     return true;
3599   }
3600   // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3601   if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3602     if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3603         CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3604         CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3605       return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3606   }
3607   return false;
3608 }
3609 
3610 // symbol+offset
SelectADDRsi_imp(SDNode * OpNode,SDValue Addr,SDValue & Base,SDValue & Offset,MVT mvt)3611 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3612     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3613   if (Addr.getOpcode() == ISD::ADD) {
3614     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3615       SDValue base = Addr.getOperand(0);
3616       if (SelectDirectAddr(base, Base)) {
3617         Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3618                                            mvt);
3619         return true;
3620       }
3621     }
3622   }
3623   return false;
3624 }
3625 
// symbol+offset
/// Select a symbol+offset address with a 32-bit offset constant.
/// Thin wrapper over SelectADDRsi_imp with MVT::i32.
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3631 
// symbol+offset
/// Select a symbol+offset address with a 64-bit offset constant.
/// Thin wrapper over SelectADDRsi_imp with MVT::i64.
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3637 
3638 // register+offset
SelectADDRri_imp(SDNode * OpNode,SDValue Addr,SDValue & Base,SDValue & Offset,MVT mvt)3639 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3640     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3641   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3642     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3643     Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3644     return true;
3645   }
3646   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3647       Addr.getOpcode() == ISD::TargetGlobalAddress)
3648     return false; // direct calls.
3649 
3650   if (Addr.getOpcode() == ISD::ADD) {
3651     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3652       return false;
3653     }
3654     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3655       if (FrameIndexSDNode *FIN =
3656               dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3657         // Constant offset from frame ref.
3658         Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3659       else
3660         Base = Addr.getOperand(0);
3661       Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3662                                          mvt);
3663       return true;
3664     }
3665   }
3666   return false;
3667 }
3668 
// register+offset
/// Select a register+offset address with a 32-bit offset constant.
/// Thin wrapper over SelectADDRri_imp with MVT::i32.
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3674 
// register+offset
/// Select a register+offset address with a 64-bit offset constant.
/// Thin wrapper over SelectADDRri_imp with MVT::i64.
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3680 
ChkMemSDNodeAddressSpace(SDNode * N,unsigned int spN) const3681 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3682                                                  unsigned int spN) const {
3683   const Value *Src = nullptr;
3684   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3685     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3686       return true;
3687     Src = mN->getMemOperand()->getValue();
3688   }
3689   if (!Src)
3690     return false;
3691   if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3692     return (PT->getAddressSpace() == spN);
3693   return false;
3694 }
3695 
3696 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3697 /// inline asm expressions.
SelectInlineAsmMemoryOperand(const SDValue & Op,unsigned ConstraintID,std::vector<SDValue> & OutOps)3698 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3699     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3700   SDValue Op0, Op1;
3701   switch (ConstraintID) {
3702   default:
3703     return true;
3704   case InlineAsm::Constraint_m: // memory
3705     if (SelectDirectAddr(Op, Op0)) {
3706       OutOps.push_back(Op0);
3707       OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3708       return false;
3709     }
3710     if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3711       OutOps.push_back(Op0);
3712       OutOps.push_back(Op1);
3713       return false;
3714     }
3715     break;
3716   }
3717   return true;
3718 }
3719 
3720 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3721 /// conversion from \p SrcTy to \p DestTy.
GetConvertOpcode(MVT DestTy,MVT SrcTy,bool IsSigned)3722 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3723                                              bool IsSigned) {
3724   switch (SrcTy.SimpleTy) {
3725   default:
3726     llvm_unreachable("Unhandled source type");
3727   case MVT::i8:
3728     switch (DestTy.SimpleTy) {
3729     default:
3730       llvm_unreachable("Unhandled dest type");
3731     case MVT::i16:
3732       return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3733     case MVT::i32:
3734       return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3735     case MVT::i64:
3736       return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3737     }
3738   case MVT::i16:
3739     switch (DestTy.SimpleTy) {
3740     default:
3741       llvm_unreachable("Unhandled dest type");
3742     case MVT::i8:
3743       return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3744     case MVT::i32:
3745       return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3746     case MVT::i64:
3747       return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3748     }
3749   case MVT::i32:
3750     switch (DestTy.SimpleTy) {
3751     default:
3752       llvm_unreachable("Unhandled dest type");
3753     case MVT::i8:
3754       return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3755     case MVT::i16:
3756       return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3757     case MVT::i64:
3758       return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3759     }
3760   case MVT::i64:
3761     switch (DestTy.SimpleTy) {
3762     default:
3763       llvm_unreachable("Unhandled dest type");
3764     case MVT::i8:
3765       return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3766     case MVT::i16:
3767       return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3768     case MVT::i32:
3769       return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3770     }
3771   }
3772 }
3773