• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the PTX instructions in TableGen format.
11//
12//===----------------------------------------------------------------------===//
13
14include "NVPTXInstrFormats.td"
15
// NOP: an instruction with no operands, an empty assembly string, and no
// selection patterns.
let hasSideEffects = 0 in
def NOP : NVPTXInst<(outs), (ins), "", []>;
20
// f16 operand whose OperandType is tagged as an immediate.
let OperandType = "OPERAND_IMMEDIATE" in
def f16imm : Operand<f16>;
24
// Vector-specific instruction properties.  Each one is a distinct
// VecInstTypeEnum value.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
34
35//===----------------------------------------------------------------------===//
36// NVPTX Operand Definitions.
37//===----------------------------------------------------------------------===//
38
// Operand of value type OtherVT, named for use as a branch target.
def brtarget : Operand<OtherVT>;
40
// CVT conversion modes, encoded as i32 immediates.
// These must match the enum in NVPTX.h

// Base modes.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI : PatLeaf<(i32 0x1)>;
def CvtRZI : PatLeaf<(i32 0x2)>;
def CvtRMI : PatLeaf<(i32 0x3)>;
def CvtRPI : PatLeaf<(i32 0x4)>;
def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;

// _FTZ variants: the corresponding base value with 0x10 added.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating modes.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

// i32 operand carrying one of the values above; printed by printCvtMode.
def CvtMode : Operand<i32> {
  let PrintMethod = "printCvtMode";
}
69
// Compare modes, encoded as i32 immediates.
// These must match the enum in NVPTX.h
def CmpEQ : PatLeaf<(i32 0)>;
def CmpNE : PatLeaf<(i32 1)>;
def CmpLT : PatLeaf<(i32 2)>;
def CmpLE : PatLeaf<(i32 3)>;
def CmpGT : PatLeaf<(i32 4)>;
def CmpGE : PatLeaf<(i32 5)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
def CmpLEU : PatLeaf<(i32 13)>;
def CmpGTU : PatLeaf<(i32 14)>;
def CmpGEU : PatLeaf<(i32 15)>;
def CmpNUM : PatLeaf<(i32 16)>;
def CmpNAN : PatLeaf<(i32 17)>;

// _FTZ variants: the corresponding base value with 0x100 added.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

// i32 operand carrying one of the compare modes; printed by printCmpMode.
def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}

// i32 operand printed by printVecElement.
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}
108
109//===----------------------------------------------------------------------===//
110// NVPTX Instruction Predicate Definitions
111//===----------------------------------------------------------------------===//
112
113
// Predicates that query the subtarget directly.
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;

// f32 flush-to-zero on / off.
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

def doMulWide : Predicate<"doMulWide">;

// FMA contraction allowed / disallowed, and unsafe FP math.
def allowFMA : Predicate<"allowFMA()">;
def noFMA : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;

// f32 division level: 0 selects the approximate form, 1 the full form.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;

// f32 square root: approximate unless usePrecSqrtF32() holds.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;

def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;

// Always-satisfied predicate.
def true : Predicate<"true">;

// Minimum PTX version checks.
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;

// Minimum SM version checks.
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;

def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
152
153//===----------------------------------------------------------------------===//
154// Some Common Instruction Class Templates
155//===----------------------------------------------------------------------===//
156
// Three-operand integer template (one result, two sources) for 64-, 32- and
// 16-bit registers.  Each width gets a register/register (rr) and a
// register/immediate (ri) form.  The bit width is appended directly to
// OpcStr, so OpcStr "add.s" produces "add.s64", "add.s32" and "add.s16".
multiclass I3<string OpcStr, SDNode OpNode> {
  def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int64Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i64imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, imm:$b))]>;
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;
  def i16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i16imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, (imm):$b))]>;
}
185
// 32-bit-only integer template (one result, two sources) with rr and ri
// forms.  ".s32" is appended to OpcStr, so OpcStr "addc.cc" produces
// "addc.cc.s32".
multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # ".s32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # ".s32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;
}
198
// Three-operand floating-point template (one result, two sources) for f64
// and f32, each with rr and ri forms.  Instructions are named
// "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// The f32 forms additionally come in ".ftz" variants (flush subnormal inputs
// and results to sign-preserving zero), guarded by doF32FTZ.  The ftz
// variants are defined ahead of the plain f32 ones; keep that order.
//
// Use this for nodes that cannot be folded into FMAs.  For nodes that can be
// (i.e. adds and muls), use F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
  def f64rr : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, Float64Regs:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, Float64Regs:$b))]>;
  def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, f64imm:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, fpimm:$b))]>;
  def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, Float32Regs:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                  Requires<[doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, f32imm:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, fpimm:$b))]>,
                  Requires<[doF32FTZ]>;
  def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, Float32Regs:$b))]>;
  def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, fpimm:$b))]>;
}
242
// Three-operand floating-point template (one result, two sources) for nodes
// that can be folded to make fma ops.  Instructions are named
// "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Two families are defined:
//   * plain forms, selected when allowFMA holds;
//   * ".rn" forms, selected when noFMA holds.  The ".rn" variant behaves just
//     like the non-".rn" op but prevents ptxas from creating FMAs.
//
// Each family covers f64 (rr, ri), f32 (rr, ri), f16 (rr) and f16x2 (rr).
// The f32/f16/f16x2 forms also have ".ftz" variants (flush subnormal inputs
// and results to sign-preserving zero) guarded by doF32FTZ; the f16 and f16x2
// forms additionally require useFP16Math.  Each ftz variant is defined ahead
// of its non-ftz counterpart; keep that order.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  //--- FMA allowed: plain forms ---------------------------------------------
  def f64rr : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, Float64Regs:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, Float64Regs:$b))]>,
              Requires<[allowFMA]>;
  def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, f64imm:$b),
                        OpcStr # ".f64 \t$dst, $a, $b;",
                        [(set Float64Regs:$dst,
                              (OpNode Float64Regs:$a, fpimm:$b))]>,
              Requires<[allowFMA]>;
  def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, Float32Regs:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                  Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                            (ins Float32Regs:$a, f32imm:$b),
                            OpcStr # ".ftz.f32 \t$dst, $a, $b;",
                            [(set Float32Regs:$dst,
                                  (OpNode Float32Regs:$a, fpimm:$b))]>,
                  Requires<[allowFMA, doF32FTZ]>;
  def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, Float32Regs:$b))]>,
              Requires<[allowFMA]>;
  def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        OpcStr # ".f32 \t$dst, $a, $b;",
                        [(set Float32Regs:$dst,
                              (OpNode Float32Regs:$a, fpimm:$b))]>,
              Requires<[allowFMA]>;

  def f16rr_ftz : NVPTXInst<(outs Float16Regs:$dst),
                            (ins Float16Regs:$a, Float16Regs:$b),
                            OpcStr # ".ftz.f16 \t$dst, $a, $b;",
                            [(set Float16Regs:$dst,
                                  (OpNode Float16Regs:$a, Float16Regs:$b))]>,
                  Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr : NVPTXInst<(outs Float16Regs:$dst),
                        (ins Float16Regs:$a, Float16Regs:$b),
                        OpcStr # ".f16 \t$dst, $a, $b;",
                        [(set Float16Regs:$dst,
                              (OpNode Float16Regs:$a, Float16Regs:$b))]>,
              Requires<[useFP16Math, allowFMA]>;

  def f16x2rr_ftz : NVPTXInst<(outs Float16x2Regs:$dst),
                              (ins Float16x2Regs:$a, Float16x2Regs:$b),
                              OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
                              [(set Float16x2Regs:$dst,
                                    (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
                    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr : NVPTXInst<(outs Float16x2Regs:$dst),
                          (ins Float16x2Regs:$a, Float16x2Regs:$b),
                          OpcStr # ".f16x2 \t$dst, $a, $b;",
                          [(set Float16x2Regs:$dst,
                                (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
                Requires<[useFP16Math, allowFMA]>;

  //--- FMA disallowed: ".rn" forms ------------------------------------------
  // These have strange names so we don't perturb existing mir tests.
  def _rnf64rr : NVPTXInst<(outs Float64Regs:$dst),
                           (ins Float64Regs:$a, Float64Regs:$b),
                           OpcStr # ".rn.f64 \t$dst, $a, $b;",
                           [(set Float64Regs:$dst,
                                 (OpNode Float64Regs:$a, Float64Regs:$b))]>,
                 Requires<[noFMA]>;
  def _rnf64ri : NVPTXInst<(outs Float64Regs:$dst),
                           (ins Float64Regs:$a, f64imm:$b),
                           OpcStr # ".rn.f64 \t$dst, $a, $b;",
                           [(set Float64Regs:$dst,
                                 (OpNode Float64Regs:$a, fpimm:$b))]>,
                 Requires<[noFMA]>;
  def _rnf32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                               (ins Float32Regs:$a, Float32Regs:$b),
                               OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
                               [(set Float32Regs:$dst,
                                     (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                     Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                               (ins Float32Regs:$a, f32imm:$b),
                               OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
                               [(set Float32Regs:$dst,
                                     (OpNode Float32Regs:$a, fpimm:$b))]>,
                     Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr : NVPTXInst<(outs Float32Regs:$dst),
                           (ins Float32Regs:$a, Float32Regs:$b),
                           OpcStr # ".rn.f32 \t$dst, $a, $b;",
                           [(set Float32Regs:$dst,
                                 (OpNode Float32Regs:$a, Float32Regs:$b))]>,
                 Requires<[noFMA]>;
  def _rnf32ri : NVPTXInst<(outs Float32Regs:$dst),
                           (ins Float32Regs:$a, f32imm:$b),
                           OpcStr # ".rn.f32 \t$dst, $a, $b;",
                           [(set Float32Regs:$dst,
                                 (OpNode Float32Regs:$a, fpimm:$b))]>,
                 Requires<[noFMA]>;
  def _rnf16rr_ftz : NVPTXInst<(outs Float16Regs:$dst),
                               (ins Float16Regs:$a, Float16Regs:$b),
                               OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
                               [(set Float16Regs:$dst,
                                     (OpNode Float16Regs:$a, Float16Regs:$b))]>,
                     Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr : NVPTXInst<(outs Float16Regs:$dst),
                           (ins Float16Regs:$a, Float16Regs:$b),
                           OpcStr # ".rn.f16 \t$dst, $a, $b;",
                           [(set Float16Regs:$dst,
                                 (OpNode Float16Regs:$a, Float16Regs:$b))]>,
                 Requires<[useFP16Math, noFMA]>;
  def _rnf16x2rr_ftz : NVPTXInst<(outs Float16x2Regs:$dst),
                                 (ins Float16x2Regs:$a, Float16x2Regs:$b),
                                 OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
                                 [(set Float16x2Regs:$dst,
                                       (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
                       Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr : NVPTXInst<(outs Float16x2Regs:$dst),
                             (ins Float16x2Regs:$a, Float16x2Regs:$b),
                             OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
                             [(set Float16x2Regs:$dst,
                                   (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
                   Requires<[useFP16Math, noFMA]>;
}
378
// Two-operand floating-point template (one result, one source) for f64 and
// f32.  Provides three instructions: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero; requires doF32FTZ) and <OpcStr>.f32.
// The ftz variant is defined ahead of the plain f32 one; keep that order.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 : NVPTXInst<(outs Float64Regs:$dst),
                      (ins Float64Regs:$a),
                      OpcStr # ".f64 \t$dst, $a;",
                      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz : NVPTXInst<(outs Float32Regs:$dst),
                          (ins Float32Regs:$a),
                          OpcStr # ".ftz.f32 \t$dst, $a;",
                          [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
                Requires<[doF32FTZ]>;
  def f32 : NVPTXInst<(outs Float32Regs:$dst),
                      (ins Float32Regs:$a),
                      OpcStr # ".f32 \t$dst, $a;",
                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
394
395//===----------------------------------------------------------------------===//
396// NVPTX Instructions.
397//===----------------------------------------------------------------------===//
398
399//-----------------------------------
400// Type Conversion
401//-----------------------------------
402
let hasSideEffects = 0 in {
  // Generate a cvt to the given type from all possible source types.  Every
  // instance takes a CvtMode immediate selecting the conversion mode; pass
  // CvtNONE to omit a conversion mode.
  //
  // Note that, despite its name, FromName is printed in the first type
  // position of the cvt mnemonic and RC is the class of $dst, i.e. both
  // describe the type being converted *to*.  The def suffix (_s8, _u8, ...)
  // names the source type.  8-bit sources live in Int16Regs.  No selection
  // patterns are attached here.
  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
    def _s8 : NVPTXInst<(outs RC:$dst),
                        (ins Int16Regs:$src, CvtMode:$mode),
                        "cvt${mode:base}${mode:ftz}${mode:sat}." #
                            FromName # ".s8 \t$dst, $src;",
                        []>;
    def _u8 : NVPTXInst<(outs RC:$dst),
                        (ins Int16Regs:$src, CvtMode:$mode),
                        "cvt${mode:base}${mode:ftz}${mode:sat}." #
                            FromName # ".u8 \t$dst, $src;",
                        []>;
    def _s16 : NVPTXInst<(outs RC:$dst),
                         (ins Int16Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".s16 \t$dst, $src;",
                         []>;
    def _u16 : NVPTXInst<(outs RC:$dst),
                         (ins Int16Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".u16 \t$dst, $src;",
                         []>;
    def _s32 : NVPTXInst<(outs RC:$dst),
                         (ins Int32Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".s32 \t$dst, $src;",
                         []>;
    def _u32 : NVPTXInst<(outs RC:$dst),
                         (ins Int32Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".u32 \t$dst, $src;",
                         []>;
    def _s64 : NVPTXInst<(outs RC:$dst),
                         (ins Int64Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".s64 \t$dst, $src;",
                         []>;
    def _u64 : NVPTXInst<(outs RC:$dst),
                         (ins Int64Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".u64 \t$dst, $src;",
                         []>;
    def _f16 : NVPTXInst<(outs RC:$dst),
                         (ins Float16Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".f16 \t$dst, $src;",
                         []>;
    def _f32 : NVPTXInst<(outs RC:$dst),
                         (ins Float32Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".f32 \t$dst, $src;",
                         []>;
    def _f64 : NVPTXInst<(outs RC:$dst),
                         (ins Float64Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." #
                             FromName # ".f64 \t$dst, $src;",
                         []>;
  }

  // Generate cvts from all types to all types.
  defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
  defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

  // Unlike the cvts above, these read and write registers of the same class
  // and take no CvtMode operand.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;
}
493
494//-----------------------------------
495// Integer Arithmetic
496//-----------------------------------
497
// Template for xor masquerading as int1 arithmetic: whatever OpNode is given,
// the emitted instruction is always "xor.pred".
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr : NVPTXInst<(outs Int1Regs:$dst),
                      (ins Int1Regs:$a, Int1Regs:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst,
                            (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri : NVPTXInst<(outs Int1Regs:$dst),
                      (ins Int1Regs:$a, i1imm:$b),
                      "xor.pred \t$dst, $a, $b;",
                      [(set Int1Regs:$dst,
                            (OpNode Int1Regs:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;
511
// int16, int32, and int64 signed addition and subtraction.  Since nvptx is
// 2's complement, we also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// int32 addition and subtraction with carry-out.
// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;

// int32 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;

// Multiplication: low half of the product.
defm MULT : I3<"mul.lo.s", mul>;

// Multiplication: high half of the product, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

// Division, signed and unsigned.
defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// Remainder, signed and unsigned.
// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
538
// Integer absolute value.  Defines one anonymous instruction that matches the
// `abs` node on register class RC and prints "abs<SizeName> $dst, $a;".
// SizeName is the type suffix including its leading dot (e.g. ".s32").
multiclass ABS<RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  "abs" # SizeName # " \t$dst, $a;",
                  [(set RC:$dst, (abs RC:$a))]>;
}
defm ABS_16 : ABS<Int16Regs, ".s16">;
defm ABS_32 : ABS<Int32Regs, ".s32">;
defm ABS_64 : ABS<Int64Regs, ".s64">;
550
// Integer min/max, signed and unsigned, for all widths covered by I3.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;
556
//
// Wide multiplication
//
// The destination register is twice as wide as the register source.  None of
// these instructions carries a selection pattern of its own.  Each flavour
// has a register form, an immediate form whose immediate has the source
// width, and an immediate form whose immediate has the destination width.

// 32 x 32 -> 64, signed.
def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
                              (ins Int32Regs:$a, i32imm:$b),
                              "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 : NVPTXInst<(outs Int64Regs:$dst),
                                (ins Int32Regs:$a, i64imm:$b),
                                "mul.wide.s32 \t$dst, $a, $b;", []>;

// 32 x 32 -> 64, unsigned.
def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
                              (ins Int32Regs:$a, i32imm:$b),
                              "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 : NVPTXInst<(outs Int64Regs:$dst),
                                (ins Int32Regs:$a, i64imm:$b),
                                "mul.wide.u32 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, signed.
def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int16Regs:$a, i16imm:$b),
                              "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 : NVPTXInst<(outs Int32Regs:$dst),
                                (ins Int16Regs:$a, i32imm:$b),
                                "mul.wide.s16 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, unsigned.
def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int16Regs:$a, i16imm:$b),
                              "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 : NVPTXInst<(outs Int32Regs:$dst),
                                (ins Int16Regs:$a, i32imm:$b),
                                "mul.wide.u16 \t$dst, $a, $b;", []>;
599
// Type profile for the mul.wide nodes: one result, two operands, and the two
// operands must have the same type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;

// Target-specific signed / unsigned wide-multiply nodes.
def mul_wide_signed :
  SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned :
  SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
603
// Matchers for the signed and unsigned mul.wide ISD nodes.  All of them are
// only active when doMulWide holds.
let Predicates = [doMulWide] in {
  // 16 x 16 -> 32.
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
            (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
            (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

  // 32 x 32 -> 64.
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}
630
// Predicates used for converting some patterns to mul.wide.  Each one matches
// an immediate whose value fits in the stated number of bits.

// Fits in a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Fits in an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Fits in a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Fits in an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;
651
// Shift amount that may be rewritten as a 32-bit mul.wide immediate.
//
// SHL2MUL32 turns the shift amount v into the 32-bit constant (1 << v).  For
// v == 31 that constant is 0x80000000, which is negative when read as a
// signed 32-bit immediate, so "mul.wide.s32" would multiply by -2^31 instead
// of 2^31.  Accept only 0 <= v < 31, for which (1 << v) is a positive int32.
// Shifts by 31 fall back to the ordinary extend-and-shift lowering.
// (The name is kept for existing users even though 31 is now excluded.)
def Int5Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(31);
}]>;
657
// Shift amount that may be rewritten as a 16-bit mul.wide immediate.
//
// SHL2MUL16 turns the shift amount v into the 16-bit constant (1 << v).  For
// v == 15 that constant is 0x8000, which is negative when read as a signed
// 16-bit immediate, so "mul.wide.s16" would multiply by -2^15 instead of
// 2^15.  Accept only 0 <= v < 15, for which (1 << v) is a positive int16.
// Shifts by 15 fall back to the ordinary extend-and-shift lowering.
// (The name is kept for existing users even though 15 is now excluded.)
def Int4Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(15);
}]>;
663
// Transforms a shift-amount immediate into the equivalent multiplier: a
// 32-bit target constant holding 1 shifted left by that amount.
def SHL2MUL32 : SDNodeXForm<imm, [{
  const APInt &ShAmt = N->getAPIntValue();
  return CurDAG->getTargetConstant(APInt(32, 1).shl(ShAmt), SDLoc(N),
                                   MVT::i32);
}]>;

// Same as SHL2MUL32, but produces a 16-bit target constant.
def SHL2MUL16 : SDNodeXForm<imm, [{
  const APInt &ShAmt = N->getAPIntValue();
  return CurDAG->getTargetConstant(APInt(16, 1).shl(ShAmt), SDLoc(N),
                                   MVT::i16);
}]>;
675
// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide,
// multiplying by the power of two that SHL2MUL32 / SHL2MUL16 derives from the
// shift amount.  Only active when doMulWide holds.
let Predicates = [doMulWide] in {
  // 32-bit source, 64-bit result.
  def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  // 16-bit source, 32-bit result.
  def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
}
690
// Convert "sign/zero-extend then multiply" to mul.wide.  The second factor is
// either extended the same way as the first, or an immediate that fits in
// the narrow type.  Only active when doMulWide holds.
let Predicates = [doMulWide] in {
  // 32 x 32 -> 64, signed.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  // 32 x 32 -> 64, unsigned.
  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  // 16 x 16 -> 32, signed.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  // 16 x 16 -> 32, unsigned.
  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}
719
720//
721// Integer multiply-add
722//
// SDTIMAD is the type profile of the target node: one result and three
// operands, where the result is an integer and all three operands have the
// same type as the result.  `imad` binds that profile to NVPTXISD::IMAD so
// the MAD* instruction patterns can match it.
723def SDTIMAD :
724  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
725                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
726def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
727
// 16-bit multiply-add: emits "mad.lo.s16 $dst, $a, $b, $c" for an imad node.
// The suffix names the operand kinds of ($a, $b, $c): r = Int16Regs register,
// i = i16imm immediate.  $a is always a register.
728def MAD16rrr :
729  NVPTXInst<(outs Int16Regs:$dst),
730            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
731            "mad.lo.s16 \t$dst, $a, $b, $c;",
732            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
// Immediate addend.
733def MAD16rri :
734  NVPTXInst<(outs Int16Regs:$dst),
735            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
736            "mad.lo.s16 \t$dst, $a, $b, $c;",
737            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
// Immediate multiplier.
738def MAD16rir :
739  NVPTXInst<(outs Int16Regs:$dst),
740            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
741            "mad.lo.s16 \t$dst, $a, $b, $c;",
742            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
// Immediate multiplier and addend.
743def MAD16rii :
744  NVPTXInst<(outs Int16Regs:$dst),
745            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
746            "mad.lo.s16 \t$dst, $a, $b, $c;",
747            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
748
// 32-bit multiply-add: emits "mad.lo.s32 $dst, $a, $b, $c" for an imad node.
// The suffix names the operand kinds of ($a, $b, $c): r = Int32Regs register,
// i = i32imm immediate.  $a is always a register.
749def MAD32rrr :
750  NVPTXInst<(outs Int32Regs:$dst),
751            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
752            "mad.lo.s32 \t$dst, $a, $b, $c;",
753            [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
// Immediate addend.
754def MAD32rri :
755  NVPTXInst<(outs Int32Regs:$dst),
756            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
757            "mad.lo.s32 \t$dst, $a, $b, $c;",
758            [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
// Immediate multiplier.
759def MAD32rir :
760  NVPTXInst<(outs Int32Regs:$dst),
761            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
762            "mad.lo.s32 \t$dst, $a, $b, $c;",
763            [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
// Immediate multiplier and addend.
764def MAD32rii :
765  NVPTXInst<(outs Int32Regs:$dst),
766            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
767            "mad.lo.s32 \t$dst, $a, $b, $c;",
768            [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
769
// 64-bit multiply-add: emits "mad.lo.s64 $dst, $a, $b, $c" for an imad node.
// The suffix names the operand kinds of ($a, $b, $c): r = Int64Regs register,
// i = i64imm immediate.  $a is always a register.
770def MAD64rrr :
771  NVPTXInst<(outs Int64Regs:$dst),
772            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
773            "mad.lo.s64 \t$dst, $a, $b, $c;",
774            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
// Immediate addend.
775def MAD64rri :
776  NVPTXInst<(outs Int64Regs:$dst),
777            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
778            "mad.lo.s64 \t$dst, $a, $b, $c;",
779            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
// Immediate multiplier.
780def MAD64rir :
781  NVPTXInst<(outs Int64Regs:$dst),
782            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
783            "mad.lo.s64 \t$dst, $a, $b, $c;",
784            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
// Immediate multiplier and addend.
785def MAD64rii :
786  NVPTXInst<(outs Int64Regs:$dst),
787            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
788            "mad.lo.s64 \t$dst, $a, $b, $c;",
789            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
790
// Integer negation.  Each def matches the `ineg` fragment on one register
// class and emits the "neg.sNN" instruction of the same width.
791def INEG16 :
792  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
793            "neg.s16 \t$dst, $src;",
794            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
795def INEG32 :
796  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
797            "neg.s32 \t$dst, $src;",
798            [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
799def INEG64 :
800  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
801            "neg.s64 \t$dst, $src;",
802            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
803
804//-----------------------------------
805// Floating Point Arithmetic
806//-----------------------------------
807
808// Constant 1.0f
// Matches an fpimm node only when its APFloat uses IEEE single semantics and
// its value converts to exactly 1.0f.  The semantics check comes first so
// convertToFloat() is only evaluated on a single-precision value.
809def FloatConst1 : PatLeaf<(fpimm), [{
810  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
811         N->getValueAPF().convertToFloat() == 1.0f;
812}]>;
813// Constant 1.0 (double)
// Same check for IEEE double semantics and the value 1.0.
814def DoubleConst1 : PatLeaf<(fpimm), [{
815  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
816         N->getValueAPF().convertToDouble() == 1.0;
817}]>;
818
819// Loads FP16 constant into a register.
820//
821// ptxas does not have hex representation for fp16, so we can't use
822// fp16 immediate values in .f16 instructions. Instead we have to load
823// the constant into a register using mov.b16.
//
// The pattern list is empty, so this instruction is never chosen by the
// generated matcher; it has to be created explicitly by other code.
824def LOAD_CONST_F16 :
825  NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
826            "mov.b16 \t$dst, $a;", []>;
827
// Instantiate the floating-point arithmetic instruction families.  The
// multiclasses F3_fma_component, F3 and F2 are defined earlier in this file;
// each takes the PTX mnemonic and the SelectionDAG node to match.
// NOTE(review): judging by the names, F3 is the two-source form, F2 the
// one-source form, and F3_fma_component is used for ops that can be fused
// into an FMA -- confirm against the multiclass definitions.
828defm FADD : F3_fma_component<"add", fadd>;
829defm FSUB : F3_fma_component<"sub", fsub>;
830defm FMUL : F3_fma_component<"mul", fmul>;
831
832defm FMIN : F3<"min", fminnum>;
833defm FMAX : F3<"max", fmaxnum>;
834
835defm FABS  : F2<"abs", fabs>;
836defm FNEG  : F2<"neg", fneg>;
// sqrt is always emitted with the explicit ".rn" rounding modifier.
837defm FSQRT : F2<"sqrt.rn", fsqrt>;
838
839//
840// F64 division
841//
// Reciprocal: matches fdiv whose dividend is the constant 1.0 (DoubleConst1)
// and emits rcp.rn.f64.  $a is declared as an input operand so the pattern can
// bind it, but it does not appear in the assembly string.
842def FDIV641r :
843  NVPTXInst<(outs Float64Regs:$dst),
844            (ins f64imm:$a, Float64Regs:$b),
845            "rcp.rn.f64 \t$dst, $b;",
846            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
// General register / register division.
847def FDIV64rr :
848  NVPTXInst<(outs Float64Regs:$dst),
849            (ins Float64Regs:$a, Float64Regs:$b),
850            "div.rn.f64 \t$dst, $a, $b;",
851            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
// Register dividend, immediate divisor.
852def FDIV64ri :
853  NVPTXInst<(outs Float64Regs:$dst),
854            (ins Float64Regs:$a, f64imm:$b),
855            "div.rn.f64 \t$dst, $a, $b;",
856            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
857
858//
859// F32 Approximate reciprocal
860//
// Selected only under do_DIVF32_APPROX.  The _ftz variant additionally
// requires doF32FTZ and adds the ".ftz" modifier.  The constant-1.0 dividend
// $a is bound by the pattern but not printed.
861def FDIV321r_ftz :
862  NVPTXInst<(outs Float32Regs:$dst),
863            (ins f32imm:$a, Float32Regs:$b),
864            "rcp.approx.ftz.f32 \t$dst, $b;",
865            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
866            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
867def FDIV321r :
868  NVPTXInst<(outs Float32Regs:$dst),
869            (ins f32imm:$a, Float32Regs:$b),
870            "rcp.approx.f32 \t$dst, $b;",
871            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
872            Requires<[do_DIVF32_APPROX]>;
873//
874// F32 Approximate division
875//
// div.approx in register/register (rr) and register/immediate (ri) forms,
// each with and without flush-to-zero.  All require do_DIVF32_APPROX.
876def FDIV32approxrr_ftz :
877  NVPTXInst<(outs Float32Regs:$dst),
878            (ins Float32Regs:$a, Float32Regs:$b),
879            "div.approx.ftz.f32 \t$dst, $a, $b;",
880            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
881            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
882def FDIV32approxri_ftz :
883  NVPTXInst<(outs Float32Regs:$dst),
884            (ins Float32Regs:$a, f32imm:$b),
885            "div.approx.ftz.f32 \t$dst, $a, $b;",
886            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
887            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
888def FDIV32approxrr :
889  NVPTXInst<(outs Float32Regs:$dst),
890            (ins Float32Regs:$a, Float32Regs:$b),
891            "div.approx.f32 \t$dst, $a, $b;",
892            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
893            Requires<[do_DIVF32_APPROX]>;
894def FDIV32approxri :
895  NVPTXInst<(outs Float32Regs:$dst),
896            (ins Float32Regs:$a, f32imm:$b),
897            "div.approx.f32 \t$dst, $a, $b;",
898            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
899            Requires<[do_DIVF32_APPROX]>;
900//
901// F32 Semi-accurate reciprocal
902//
903// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
904//
// These emit the same rcp.approx text as the approximate-reciprocal defs, but
// are selected under do_DIVF32_FULL instead of do_DIVF32_APPROX.
905def FDIV321r_approx_ftz :
906  NVPTXInst<(outs Float32Regs:$dst),
907            (ins f32imm:$a, Float32Regs:$b),
908            "rcp.approx.ftz.f32 \t$dst, $b;",
909            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
910            Requires<[do_DIVF32_FULL, doF32FTZ]>;
911def FDIV321r_approx :
912  NVPTXInst<(outs Float32Regs:$dst),
913            (ins f32imm:$a, Float32Regs:$b),
914            "rcp.approx.f32 \t$dst, $b;",
915            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
916            Requires<[do_DIVF32_FULL]>;
917//
918// F32 Semi-accurate division
919//
// div.full in register/register (rr) and register/immediate (ri) forms, each
// with and without flush-to-zero.  All require do_DIVF32_FULL.
920def FDIV32rr_ftz :
921  NVPTXInst<(outs Float32Regs:$dst),
922            (ins Float32Regs:$a, Float32Regs:$b),
923            "div.full.ftz.f32 \t$dst, $a, $b;",
924            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
925            Requires<[do_DIVF32_FULL, doF32FTZ]>;
926def FDIV32ri_ftz :
927  NVPTXInst<(outs Float32Regs:$dst),
928            (ins Float32Regs:$a, f32imm:$b),
929            "div.full.ftz.f32 \t$dst, $a, $b;",
930            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
931            Requires<[do_DIVF32_FULL, doF32FTZ]>;
932def FDIV32rr :
933  NVPTXInst<(outs Float32Regs:$dst),
934            (ins Float32Regs:$a, Float32Regs:$b),
935            "div.full.f32 \t$dst, $a, $b;",
936            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
937            Requires<[do_DIVF32_FULL]>;
938def FDIV32ri :
939  NVPTXInst<(outs Float32Regs:$dst),
940            (ins Float32Regs:$a, f32imm:$b),
941            "div.full.f32 \t$dst, $a, $b;",
942            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
943            Requires<[do_DIVF32_FULL]>;
944//
945// F32 Accurate reciprocal
946//
// Round-to-nearest reciprocal.  The _ftz variant requires doF32FTZ; the plain
// variant carries no predicate at all.
947def FDIV321r_prec_ftz :
948  NVPTXInst<(outs Float32Regs:$dst),
949            (ins f32imm:$a, Float32Regs:$b),
950            "rcp.rn.ftz.f32 \t$dst, $b;",
951            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
952            Requires<[doF32FTZ]>;
953def FDIV321r_prec :
954  NVPTXInst<(outs Float32Regs:$dst),
955            (ins f32imm:$a, Float32Regs:$b),
956            "rcp.rn.f32 \t$dst, $b;",
957            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
958//
959// F32 Accurate division
960//
// Round-to-nearest division in rr and ri forms.  The non-FTZ defs have no
// Requires clause, so they are not restricted by any division-precision
// predicate.
961def FDIV32rr_prec_ftz :
962  NVPTXInst<(outs Float32Regs:$dst),
963            (ins Float32Regs:$a, Float32Regs:$b),
964            "div.rn.ftz.f32 \t$dst, $a, $b;",
965            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
966            Requires<[doF32FTZ]>;
967def FDIV32ri_prec_ftz :
968  NVPTXInst<(outs Float32Regs:$dst),
969            (ins Float32Regs:$a, f32imm:$b),
970            "div.rn.ftz.f32 \t$dst, $a, $b;",
971            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
972            Requires<[doF32FTZ]>;
973def FDIV32rr_prec :
974  NVPTXInst<(outs Float32Regs:$dst),
975            (ins Float32Regs:$a, Float32Regs:$b),
976            "div.rn.f32 \t$dst, $a, $b;",
977            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
978def FDIV32ri_prec :
979  NVPTXInst<(outs Float32Regs:$dst),
980            (ins Float32Regs:$a, f32imm:$b),
981            "div.rn.f32 \t$dst, $a, $b;",
982            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
983
984//
985// FMA
986//
987
// Fused multiply-add family for one register class.
//   OpcStr - full PTX mnemonic, e.g. "fma.rn.f32".
//   RC     - register class of the result and of every register operand.
//   ImmCls - operand class used for immediate operands.
//   Pred   - predicate each generated instruction requires.
// The suffix names the operand kinds of ($a, $b, $c): r = register,
// i = immediate.  $a is always a register.
988multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
989   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
990                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
991                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
992                       Requires<[Pred]>;
993   def rri : NVPTXInst<(outs RC:$dst),
994                       (ins RC:$a, RC:$b, ImmCls:$c),
995                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
996                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
997                       Requires<[Pred]>;
998   def rir : NVPTXInst<(outs RC:$dst),
999                       (ins RC:$a, ImmCls:$b, RC:$c),
1000                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1001                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
1002                       Requires<[Pred]>;
1003   def rii : NVPTXInst<(outs RC:$dst),
1004                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
1005                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1006                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
1007                       Requires<[Pred]>;
1008}
1009
// Half-precision fused multiply-add.  Only the all-register form exists (there
// is no ImmCls parameter), and every instance requires useFP16Math in addition
// to the caller-supplied Pred.
1010multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
1011   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1012                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1013                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
1014                       Requires<[useFP16Math, Pred]>;
1015}
1016
// FMA instantiations.  Each *_ftz def is listed before its non-FTZ
// counterpart and is gated on doF32FTZ (the f16 and f16x2 FTZ forms use that
// same predicate); the non-FTZ defs pass the predicate `true`.  There is no
// FTZ form for f64.
1017defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
1018defm FMA16     : FMA_F16<"fma.rn.f16", Float16Regs, true>;
1019defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
1020defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
1021defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
1022defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
1023defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
1024
1025// sin/cos
// f32 only.  Both map to the ".approx" PTX instructions and are therefore
// selected only when allowUnsafeFPMath holds.
1026def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1027                      "sin.approx.f32 \t$dst, $src;",
1028                      [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
1029                      Requires<[allowUnsafeFPMath]>;
1030def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1031                      "cos.approx.f32 \t$dst, $src;",
1032                      [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
1033                      Requires<[allowUnsafeFPMath]>;
1034
1035// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1036// i.e. "poor man's fmod()"
//
// LLVM's frem has fmod semantics: the result takes the sign of the dividend,
// so the quotient must be rounded toward zero (CvtRZI), not toward minus
// infinity (CvtRMI).  With floor rounding, frem(-5.0, 3.0) would give 1.0
// instead of -2.0.
//
// NOTE(review): when $y is infinite the quotient truncates to 0 and 0 * inf
// is NaN, so this expansion presumably yields NaN where fmod would return $x;
// confirm whether that case needs separate handling.
1037
1038// frem - f32 FTZ
1039def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1040          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
1041            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
1042             Float32Regs:$y))>,
1043          Requires<[doF32FTZ]>;
1044def : Pat<(frem Float32Regs:$x, fpimm:$y),
1045          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
1046            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
1047             fpimm:$y))>,
1048          Requires<[doF32FTZ]>;
1049
1050// frem - f32
1051def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
1052          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
1053            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
1054             Float32Regs:$y))>;
1055def : Pat<(frem Float32Regs:$x, fpimm:$y),
1056          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
1057            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
1058             fpimm:$y))>;
1059
1060// frem - f64
1061def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
1062          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
1063            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
1064             Float64Regs:$y))>;
1065def : Pat<(frem Float64Regs:$x, fpimm:$y),
1066          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
1067            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
1068             fpimm:$y))>;
1069
1070//-----------------------------------
1071// Bitwise operations
1072//-----------------------------------
1073
1074// Template for two-source bitwise operations (each instruction has three
1075// operands: dst, a, b).  Creates .b16, .b32, .b64, and .pred (predicate
// registers -- i.e., i1) versions of OpcStr.
//   OpcStr - PTX mnemonic without the type suffix, e.g. "or".
//   OpNode - SelectionDAG node the generated patterns match.
// For every width there is a register/register (rr) and a register/immediate
// (ri) form.
1076multiclass BITWISE<string OpcStr, SDNode OpNode> {
1077  def b1rr :
1078    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
1079              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
1080              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
1081  def b1ri :
1082    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
1083              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
1084              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
1085  def b16rr :
1086    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
1087              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
1088              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
1089  def b16ri :
1090    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
1091              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
1092              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
1093  def b32rr :
1094    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1095              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
1096              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
1097  def b32ri :
1098    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1099              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
1100              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
1101  def b64rr :
1102    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
1103              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
1104              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
1105  def b64ri :
1106    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
1107              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
1108              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
1109}
1110
// Instantiate the BITWISE family for or / and / xor.
1111defm OR  : BITWISE<"or", or>;
1112defm AND : BITWISE<"and", and>;
1113defm XOR : BITWISE<"xor", xor>;
1114
// Bitwise complement.  NOT1 works on predicate (i1) registers and emits
// "not.pred"; the others emit "not.bNN" for the matching register width.
1115def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
1116                      "not.pred \t$dst, $src;",
1117                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
1118def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1119                      "not.b16 \t$dst, $src;",
1120                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
1121def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1122                      "not.b32 \t$dst, $src;",
1123                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
1124def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1125                       "not.b64 \t$dst, $src;",
1126                       [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1127
1128// Template for left/right shifts.  Takes three operands,
1129//   [dest (reg), src (reg), shift (reg or imm)].
1130// dest and src may be int64, int32, or int16, but shift is always int32.
1131//
1132// This template also defines a 32-bit shift (imm, imm) instruction.
//
//   OpcStr - PTX mnemonic up to, but not including, the bit width
//            (e.g. "shl.b"); the width "64", "32" or "16" is appended here.
//   OpNode - SelectionDAG shift node the generated patterns match.
1133multiclass SHIFT<string OpcStr, SDNode OpNode> {
1134   def i64rr :
1135     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
1136               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1137               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
1138   def i64ri :
1139     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
1140               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1141               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
1142   def i32rr :
1143     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1144               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1145               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
1146   def i32ri :
1147     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1148               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1149               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
   // Both the shifted value and the shift amount are immediates.
1150   def i32ii :
1151     NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
1152               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1153               [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
1154   def i16rr :
1155     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
1156               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1157               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
1158   def i16ri :
1159     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
1160               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1161               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
1162}
1163
// Shift instantiations: shl uses the untyped ".b" form, arithmetic right
// shift (sra) uses "shr.s" and logical right shift (srl) uses "shr.u".
1164defm SHL : SHIFT<"shl.b", shl>;
1165defm SRA : SHIFT<"shr.s", sra>;
1166defm SRL : SHIFT<"shr.u", srl>;
1167
1168// Bit-reverse
// Matches the bitreverse node for 32- and 64-bit registers and emits
// brev.b32 / brev.b64.  No 16-bit form is defined here.
1169def BREV32 :
1170  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
1171             "brev.b32 \t$dst, $a;",
1172             [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
1173def BREV64 :
1174  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
1175             "brev.b64 \t$dst, $a;",
1176             [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
1177
1178//
1179// Rotate: Use ptx shf instruction if available.
1180//
1181
1182// 32 bit r2 = rotl r1, n
1183//    =>
1184//        r2 = shf.l r1, r1, n
// The rotate is expressed as a wrap-mode funnel shift with $src supplied as
// both data operands.  Only selected when hasHWROT32 holds.
1185def ROTL32imm_hw :
1186  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1187            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1188            [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
1189           Requires<[hasHWROT32]>;
1190
// Same, with the rotate amount in a register.
1191def ROTL32reg_hw :
1192  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1193            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1194            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1195           Requires<[hasHWROT32]>;
1196
1197// 32 bit r2 = rotr r1, n
1198//    =>
1199//        r2 = shf.r r1, r1, n
1200def ROTR32imm_hw :
1201  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1202            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1203            [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
1204           Requires<[hasHWROT32]>;
1205
// Same, with the rotate amount in a register.
1206def ROTR32reg_hw :
1207  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1208            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1209            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1210           Requires<[hasHWROT32]>;
1211
1212// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Emits a brace-delimited PTX sequence that declares two scratch registers,
// shifts $src left by $amt1 and right by $amt2, and adds the two halves.
// The instruction has no pattern of its own; the two Pat records that follow
// select it for rotl and rotr.
1213def ROT32imm_sw :
1214  NVPTXInst<(outs Int32Regs:$dst),
1215            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
1216            "{{\n\t"
1217            ".reg .b32 %lhs;\n\t"
1218            ".reg .b32 %rhs;\n\t"
1219            "shl.b32 \t%lhs, $src, $amt1;\n\t"
1220            "shr.b32 \t%rhs, $src, $amt2;\n\t"
1221            "add.u32 \t$dst, %lhs, %rhs;\n\t"
1222            "}}",
1223            []>;
1224
// Transforms an immediate rotate amount N into the i32 target constant 32 - N.
1225def SUB_FRM_32 : SDNodeXForm<imm, [{
1226  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
1227}]>;
1228
// rotl by amt: left shift by amt, right shift by 32 - amt.
// rotr by amt: the two amounts are swapped.
// Used only when noHWROT32 holds.
1229def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
1230          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
1231      Requires<[noHWROT32]>;
1232def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
1233          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
1234      Requires<[noHWROT32]>;
1235
1236// 32-bit software rotate left by register.
// The emitted sequence computes %amt2 = 32 - $amt at run time, shifts $src
// left by $amt and right by %amt2, and adds the two halves.  Selected only
// when noHWROT32 holds.
1237def ROTL32reg_sw :
1238  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1239            "{{\n\t"
1240            ".reg .b32 %lhs;\n\t"
1241            ".reg .b32 %rhs;\n\t"
1242            ".reg .b32 %amt2;\n\t"
1243            "shl.b32 \t%lhs, $src, $amt;\n\t"
1244            "sub.s32 \t%amt2, 32, $amt;\n\t"
1245            "shr.b32 \t%rhs, $src, %amt2;\n\t"
1246            "add.u32 \t$dst, %lhs, %rhs;\n\t"
1247            "}}",
1248            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1249           Requires<[noHWROT32]>;
1250
1251// 32-bit software rotate right by register.
// Mirror image of ROTL32reg_sw: right shift by $amt, left shift by 32 - $amt.
1252def ROTR32reg_sw :
1253  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1254            "{{\n\t"
1255            ".reg .b32 %lhs;\n\t"
1256            ".reg .b32 %rhs;\n\t"
1257            ".reg .b32 %amt2;\n\t"
1258            "shr.b32 \t%lhs, $src, $amt;\n\t"
1259            "sub.s32 \t%amt2, 32, $amt;\n\t"
1260            "shl.b32 \t%rhs, $src, %amt2;\n\t"
1261            "add.u32 \t$dst, %lhs, %rhs;\n\t"
1262            "}}",
1263            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1264           Requires<[noHWROT32]>;
1265
1266// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Same shape as the 32-bit immediate rotate, using .b64 scratch registers.
// The instruction has no pattern of its own; the two Pat records that follow
// select it.
1267def ROT64imm_sw :
1268  NVPTXInst<(outs Int64Regs:$dst),
1269            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
1270            "{{\n\t"
1271            ".reg .b64 %lhs;\n\t"
1272            ".reg .b64 %rhs;\n\t"
1273            "shl.b64 \t%lhs, $src, $amt1;\n\t"
1274            "shr.b64 \t%rhs, $src, $amt2;\n\t"
1275            "add.u64 \t$dst, %lhs, %rhs;\n\t"
1276            "}}",
1277            []>;
1278
// Transforms an immediate rotate amount N into the i32 target constant 64 - N.
1279def SUB_FRM_64 : SDNodeXForm<imm, [{
1280    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
1281}]>;
1282
// Unlike the 32-bit software rotate patterns, these carry no Requires clause,
// so they apply unconditionally.
1283def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
1284          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
1285def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
1286          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1287
1288// 64-bit software rotate left by register.
// The rotate amount is a 32-bit register.  The emitted sequence computes
// %amt2 = 64 - $amt at run time, shifts $src left by $amt and right by %amt2,
// and adds the two halves.  No predicate is attached.
1289def ROTL64reg_sw :
1290  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1291            "{{\n\t"
1292            ".reg .b64 %lhs;\n\t"
1293            ".reg .b64 %rhs;\n\t"
1294            ".reg .u32 %amt2;\n\t"
1295            "shl.b64 \t%lhs, $src, $amt;\n\t"
1296            "sub.u32 \t%amt2, 64, $amt;\n\t"
1297            "shr.b64 \t%rhs, $src, %amt2;\n\t"
1298            "add.u64 \t$dst, %lhs, %rhs;\n\t"
1299            "}}",
1300            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
1301
// 64-bit software rotate right by register: right shift by $amt, left shift
// by 64 - $amt.
1302def ROTR64reg_sw :
1303  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1304            "{{\n\t"
1305            ".reg .b64 %lhs;\n\t"
1306            ".reg .b64 %rhs;\n\t"
1307            ".reg .u32 %amt2;\n\t"
1308            "shr.b64 \t%lhs, $src, $amt;\n\t"
1309            "sub.u32 \t%amt2, 64, $amt;\n\t"
1310            "shl.b64 \t%rhs, $src, %amt2;\n\t"
1311            "add.u64 \t$dst, %lhs, %rhs;\n\t"
1312            "}}",
1313            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
1314
1315//
1316// Funnel shift in clamp mode
1317//
1318
1319// Create SDNodes so they can be used in the DAG code, e.g.
1320// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
//
// Type profile: one result and three operands.  Operands 1 and 2 have the
// same type as the result, which is an integer; operand 3 (the shift amount)
// is an integer whose type is not tied to the others.
1321def SDTIntShiftDOp :
1322  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
1323                       SDTCisInt<0>, SDTCisInt<3>]>;
1324def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
1325def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
1326
// Left funnel shift, clamp mode: operands are passed through to
// "shf.l.clamp.b32" in the order ($lo, $hi, $amt).
1327def FUNSHFLCLAMP :
1328  NVPTXInst<(outs Int32Regs:$dst),
1329            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1330            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
1331            [(set Int32Regs:$dst,
1332              (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1333
// Right funnel shift, clamp mode.
1334def FUNSHFRCLAMP :
1335  NVPTXInst<(outs Int32Regs:$dst),
1336            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1337            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
1338            [(set Int32Regs:$dst,
1339             (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1340
1341//
1342// BFE - bit-field extract
1343//
1344
1345// Template for BFE instructions.  Takes four args,
1346//   [dest (reg), src (reg), start (reg or imm), length (reg or imm)].
1347// Start may be an imm only if length is also an imm.  FIXME: Is this a
1348// restriction in PTX?
1349//
1350// dest and src may be int32 or int64, but start and length are always int32.
//
// Per the PTX ISA, the third source of bfe is the field length in bits, not
// an end position.
//   TyStr - PTX type suffix placed after "bfe.", e.g. "s32".
//   RC    - register class of the destination and of the source value.
// The defs carry no patterns, so they are never chosen by the generated
// matcher and must be created explicitly.
1351multiclass BFE<string TyStr, RegisterClass RC> {
1352  def rrr
1353    : NVPTXInst<(outs RC:$d),
1354                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
1355                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1356  def rri
1357    : NVPTXInst<(outs RC:$d),
1358                (ins RC:$a, Int32Regs:$b, i32imm:$c),
1359                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1360  def rii
1361    : NVPTXInst<(outs RC:$d),
1362                (ins RC:$a, i32imm:$b, i32imm:$c),
1363                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1364}
1365
// Signed and unsigned extracts for 32- and 64-bit sources, all marked as
// having no side effects.
1366let hasSideEffects = 0 in {
1367  defm BFE_S32 : BFE<"s32", Int32Regs>;
1368  defm BFE_U32 : BFE<"u32", Int32Regs>;
1369  defm BFE_S64 : BFE<"s64", Int64Regs>;
1370  defm BFE_U64 : BFE<"u64", Int64Regs>;
1371}
1372
1373//-----------------------------------
1374// Comparison instructions (setp, set)
1375//-----------------------------------
1376
1377// FIXME: This doesn't cover versions of set and setp that combine with a
1378// boolean predicate, e.g. setp.eq.and.b16.
1379
// Template for setp: compares $a with $b and writes a predicate (Int1Regs).
//   TypeStr - PTX type suffix, e.g. "s32".
//   RC      - register class of the compared values.
//   ImmCls  - operand class used when one side is an immediate.
// The comparison is the CmpMode operand $cmp, which is printed twice in the
// mnemonic, once with the `base` modifier and once with the `ftz` modifier.
// Forms: rr (reg, reg), ri (reg, imm), ir (imm, reg).  None has a pattern.
1380let hasSideEffects = 0 in {
1381  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1382    def rr :
1383      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1384                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1385                           " \t$dst, $a, $b;"), []>;
1386    def ri :
1387      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1388                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1389                           " \t$dst, $a, $b;"), []>;
1390    def ir :
1391      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1392                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1393                           " \t$dst, $a, $b;"), []>;
1394  }
1395}
1396
// setp instantiations for the integer (b/s/u, 16/32/64-bit) and f32/f64 types.
1397defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
1398defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
1399defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
1400defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
1401defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
1402defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
1403defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
1404defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
1405defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
1406defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
1407defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
// f16 is defined by hand instead of through the SETP multiclass: only the
// register/register form exists and it requires useFP16Math.
1408def SETP_f16rr :
1409      NVPTXInst<(outs Int1Regs:$dst),
1410                (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
1411                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
1412                []>, Requires<[useFP16Math]>;
1413
// Packed f16x2 compare: writes two predicates, printed as "$p|$q".
1414def SETP_f16x2rr :
1415      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
1416                (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
1417                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
1418                []>,
1419                Requires<[useFP16Math]>;
1420
1421
1422// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1423// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1424// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1425
// Template for set: like setp, but the result goes to an Int32Regs register.
// $cmp is printed without any modifier here, and only one type suffix
// (TypeStr) is emitted, which is what the FIXME above refers to.
// Forms: rr (reg, reg), ri (reg, imm), ir (imm, reg).  None has a pattern.
1426let hasSideEffects = 0 in {
1427  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
1428    def rr : NVPTXInst<(outs Int32Regs:$dst),
1429                       (ins RC:$a, RC:$b, CmpMode:$cmp),
1430                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1431    def ri : NVPTXInst<(outs Int32Regs:$dst),
1432                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1433                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1434    def ir : NVPTXInst<(outs Int32Regs:$dst),
1435                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1436                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1437  }
1438}
1439
// Instantiate SET per scalar type.  Unlike the SETP list, this one also
// includes an f16 variant built from the multiclass.
defm SET_b16 : SET<"b16", Int16Regs,   i16imm>;
defm SET_s16 : SET<"s16", Int16Regs,   i16imm>;
defm SET_u16 : SET<"u16", Int16Regs,   i16imm>;
defm SET_b32 : SET<"b32", Int32Regs,   i32imm>;
defm SET_s32 : SET<"s32", Int32Regs,   i32imm>;
defm SET_u32 : SET<"u32", Int32Regs,   i32imm>;
defm SET_b64 : SET<"b64", Int64Regs,   i64imm>;
defm SET_s64 : SET<"s64", Int64Regs,   i64imm>;
defm SET_u64 : SET<"u64", Int64Regs,   i64imm>;
defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1452
1453//-----------------------------------
1454// Selection instructions (selp)
1455//-----------------------------------
1456
1457// FIXME: Missing slct
1458
1459// selp instructions that don't have any pattern matches; we explicitly use
1460// them within this file.
let hasSideEffects = 0 in {
  // Pattern-less "selp.<TypeStr>" in all four register/immediate
  // combinations of $a and $b.  $p is always a predicate register.
  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<
        (outs RC:$dst),
        (ins RC:$a, RC:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        []>;
    def ri : NVPTXInst<
        (outs RC:$dst),
        (ins RC:$a, ImmCls:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        []>;
    def ir : NVPTXInst<
        (outs RC:$dst),
        (ins ImmCls:$a, RC:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        []>;
    def ii : NVPTXInst<
        (outs RC:$dst),
        (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        []>;
  }

  // The same four encodings as SELP, but each one also matches a `select`
  // node.  ImmNode is the immediate leaf used on the pattern side wherever
  // the instruction takes an ImmCls operand.
  multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
                          SDNode ImmNode> {
    def rr : NVPTXInst<
        (outs RC:$dst),
        (ins RC:$a, RC:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
    def ri : NVPTXInst<
        (outs RC:$dst),
        (ins RC:$a, ImmCls:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
    def ir : NVPTXInst<
        (outs RC:$dst),
        (ins ImmCls:$a, RC:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
    def ii : NVPTXInst<
        (outs RC:$dst),
        (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
        [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
  }
}
1501
// Only the bit-sized integer forms (and the floating-point forms) carry
// select patterns.  The signed/unsigned integer forms are defined without
// patterns because selp.b{16,32,64} is just as good for matching.
// Note that SELP_f16 is printed with the "b16" type string.
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs,   i16imm, imm>;
defm SELP_s16 : SELP        <"s16", Int16Regs,   i16imm>;
defm SELP_u16 : SELP        <"u16", Int16Regs,   i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs,   i32imm, imm>;
defm SELP_s32 : SELP        <"s32", Int32Regs,   i32imm>;
defm SELP_u32 : SELP        <"u32", Int32Regs,   i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs,   i64imm, imm>;
defm SELP_s64 : SELP        <"s64", Int64Regs,   i64imm>;
defm SELP_u64 : SELP        <"u64", Int64Regs,   i64imm>;
defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
1516
// Select between two packed f16x2 registers.  It is printed as selp.b32 and
// matches `select` on Float16x2Regs operands.
def SELP_f16x2rr : NVPTXInst<
    (outs Float16x2Regs:$dst),
    (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
    "selp.b32 \t$dst, $a, $b, $p;",
    [(set Float16x2Regs:$dst,
          (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
1523
1524//-----------------------------------
1525// Data Movement (Load / Store, Move)
1526//-----------------------------------
1527
// Addressing-mode complex patterns.  The ri forms produce two operands and
// are rooted at frameindex nodes; they are handled by SelectADDRri and
// SelectADDRri64.  ADDRvar produces one operand via SelectDirectAddr.
def ADDRri   : ComplexPattern<i32, 2, "SelectADDRri",
                              [frameindex], [SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64",
                              [frameindex], [SDNPWantRoot]>;
def ADDRvar  : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
1533
// Two-part memory operands (a register plus an immediate) in 32- and 64-bit
// flavors.  Both are printed by printMemOperand.
def MEMri : Operand<i32> {
  let MIOperandInfo = (ops Int32Regs, i32imm);
  let PrintMethod   = "printMemOperand";
}
def MEMri64 : Operand<i64> {
  let MIOperandInfo = (ops Int64Regs, i64imm);
  let PrintMethod   = "printMemOperand";
}
1542
// Pointer-typed operand, printed with the generic printOperand hook.
def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

// Same as imem, but typed iPTRAny.
def imemAny : Operand<iPTRAny> {
  let PrintMethod = "printOperand";
}

// i32 operand rendered by printLdStCode.
def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}
1554
// NVPTXISD::Wrapper: one result and one operand of the same pointer type.
def SDTWrapper : SDTypeProfile<1, 1,
                               [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
1557
// Materialize a wrapped global address in a register: mov.u32 for an
// Int32Regs destination, mov.u64 for an Int64Regs destination.
def MOV_ADDR : NVPTXInst<
    (outs Int32Regs:$dst), (ins imem:$a),
    "mov.u32 \t$dst, $a;",
    [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<
    (outs Int64Regs:$dst), (ins imem:$a),
    "mov.u64 \t$dst, $a;",
    [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1565
// Get a pointer to the local stack: move the symbol "__local_depot<num>"
// into a 32- or 64-bit register.  Neither form has a selection pattern.
let hasSideEffects = 0 in {
  def MOV_DEPOT_ADDR : NVPTXInst<
      (outs Int32Regs:$d), (ins i32imm:$num),
      "mov.u32 \t$d, __local_depot$num;",
      []>;
  def MOV_DEPOT_ADDR_64 : NVPTXInst<
      (outs Int64Regs:$d), (ins i32imm:$num),
      "mov.u64 \t$d, __local_depot$num;",
      []>;
}
1573
1574
// Register-to-register moves, all flagged IsSimpleMove and pattern-less.
// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp.
let IsSimpleMove = 1, hasSideEffects = 0 in {
  // Predicate and integer registers.
  def IMOV1rr : NVPTXInst<
      (outs Int1Regs:$dst), (ins Int1Regs:$sss),
      "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<
      (outs Int16Regs:$dst), (ins Int16Regs:$sss),
      "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int32Regs:$sss),
      "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int64Regs:$sss),
      "mov.u64 \t$dst, $sss;", []>;

  // Floating-point registers.  The f16 move is printed as .b16 because
  // there's no mov.f16.
  def FMOV16rr : NVPTXInst<
      (outs Float16Regs:$dst), (ins Float16Regs:$src),
      "mov.b16 \t$dst, $src;", []>;
  def FMOV32rr : NVPTXInst<
      (outs Float32Regs:$dst), (ins Float32Regs:$src),
      "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<
      (outs Float64Regs:$dst), (ins Float64Regs:$src),
      "mov.f64 \t$dst, $src;", []>;
}
1594
// Integer immediate materialization; each form matches a plain `imm` node
// of the destination register class.
def IMOV1ri : NVPTXInst<
    (outs Int1Regs:$dst), (ins i1imm:$src),
    "mov.pred \t$dst, $src;",
    [(set Int1Regs:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<
    (outs Int16Regs:$dst), (ins i16imm:$src),
    "mov.u16 \t$dst, $src;",
    [(set Int16Regs:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<
    (outs Int32Regs:$dst), (ins i32imm:$src),
    "mov.u32 \t$dst, $src;",
    [(set Int32Regs:$dst, imm:$src)]>;
def IMOV64i : NVPTXInst<
    (outs Int64Regs:$dst), (ins i64imm:$src),
    "mov.u64 \t$dst, $src;",
    [(set Int64Regs:$dst, imm:$src)]>;
1607
// Floating-point immediate materialization for f32 and f64 via `fpimm`.
def FMOV32ri : NVPTXInst<
    (outs Float32Regs:$dst), (ins f32imm:$src),
    "mov.f32 \t$dst, $src;",
    [(set Float32Regs:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<
    (outs Float64Regs:$dst), (ins f64imm:$src),
    "mov.f64 \t$dst, $src;",
    [(set Float64Regs:$dst, fpimm:$src)]>;
1614
// A wrapped external symbol of type i32 is materialized with IMOV32ri.
def : Pat<(i32 (Wrapper texternalsym:$dst)),
          (IMOV32ri texternalsym:$dst)>;
1616
//---- Copy Frame Index ----
// Turn an ADDRri / ADDRri64 match into an add that leaves the resulting
// address in a register.  The memory operand is printed with the "add"
// modifier.
def LEA_ADDRi : NVPTXInst<
    (outs Int32Regs:$dst), (ins MEMri:$addr),
    "add.u32 \t$dst, ${addr:add};",
    [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 : NVPTXInst<
    (outs Int64Regs:$dst), (ins MEMri64:$addr),
    "add.u64 \t$dst, ${addr:add};",
    [(set Int64Regs:$dst, ADDRri64:$addr)]>;
1624
1625//-----------------------------------
1626// Comparison and Selection
1627//-----------------------------------
1628
// Integer comparison patterns for one condition code.
//
// OpNode is the comparison fragment to match and Mode is the compare-mode
// leaf passed as the last operand of the selected instruction.  The
// remaining parameters name the instructions to emit: setp_* when the
// result type is i1, set_* when it is i32, each in rr/ri/ir operand shapes
// for 16-, 32- and 64-bit sources.
multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                       Instruction setp_16rr, Instruction setp_16ri,
                       Instruction setp_16ir,
                       Instruction setp_32rr, Instruction setp_32ri,
                       Instruction setp_32ir,
                       Instruction setp_64rr, Instruction setp_64ri,
                       Instruction setp_64ir,
                       Instruction set_16rr, Instruction set_16ri,
                       Instruction set_16ir,
                       Instruction set_32rr, Instruction set_32ri,
                       Instruction set_32ir,
                       Instruction set_64rr, Instruction set_64ri,
                       Instruction set_64ir> {
  // i16 -> pred
  def : Pat<
      (i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
      (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<
      (i1 (OpNode Int16Regs:$a, imm:$b)),
      (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<
      (i1 (OpNode imm:$a, Int16Regs:$b)),
      (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> pred
  def : Pat<
      (i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
      (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<
      (i1 (OpNode Int32Regs:$a, imm:$b)),
      (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<
      (i1 (OpNode imm:$a, Int32Regs:$b)),
      (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> pred
  def : Pat<
      (i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
      (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<
      (i1 (OpNode Int64Regs:$a, imm:$b)),
      (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<
      (i1 (OpNode imm:$a, Int64Regs:$b)),
      (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  // i16 -> i32
  def : Pat<
      (i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
      (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<
      (i32 (OpNode Int16Regs:$a, imm:$b)),
      (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<
      (i32 (OpNode imm:$a, Int16Regs:$b)),
      (set_16ir imm:$a, Int16Regs:$b, Mode)>;
  // i32 -> i32
  def : Pat<
      (i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
      (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<
      (i32 (OpNode Int32Regs:$a, imm:$b)),
      (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<
      (i32 (OpNode imm:$a, Int32Regs:$b)),
      (set_32ir imm:$a, Int32Regs:$b, Mode)>;
  // i64 -> i32
  def : Pat<
      (i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
      (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<
      (i32 (OpNode Int64Regs:$a, imm:$b)),
      (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<
      (i32 (OpNode imm:$a, Int64Regs:$b)),
      (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}
1692
// ISET_FORMAT bound to the signed (s16/s32/s64) SETP and SET instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                // predicate-producing forms
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                // i32-producing forms
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
1704
// ISET_FORMAT bound to the unsigned (u16/u32/u64) SETP and SET instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                // predicate-producing forms
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                // i32-producing forms
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
1716
// Signed instructions serve the plain set{gt,lt,ge,le,eq,ne} fragments.
defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;

// Unsigned instructions serve the setu* fragments with the same mode leaves.
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
1729
// i1 compares.
// Inequality of two predicates is their XOR (setne and setune alike).
def : Pat<
    (setne Int1Regs:$a, Int1Regs:$b),
    (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<
    (setune Int1Regs:$a, Int1Regs:$b),
    (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

// Equality is the negated XOR (seteq and setueq alike).
def : Pat<
    (seteq Int1Regs:$a, Int1Regs:$b),
    (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<
    (setueq Int1Regs:$a, Int1Regs:$b),
    (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1740
// i1 compare -> i32
// The XOR of the two predicates is true exactly when they differ.  setne
// therefore selects -1 when the XOR is true, and seteq selects -1 when the
// XOR is false.  (Both patterns previously matched setne, which left the
// second one a dead duplicate and gave seteq no i32 lowering here.)
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1746
1747
1748
// Floating-point comparison patterns for one condition code.
//
// OpNode is the comparison fragment to match.  Mode is the compare-mode leaf
// used by default; ModeFTZ is used instead by the patterns guarded with
// doF32FTZ.  For f16 and f32 each operand shape is emitted twice: once with
// ModeFTZ under doF32FTZ, once with Mode.  f64 only has the Mode form.
//
// f16 patterns additionally require useFP16Math.  An f16 immediate operand
// is first placed in a register with LOAD_CONST_F16, so every f16 pattern
// selects the register-register (rr) instruction.
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  // f16 -> pred
  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
            (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math,doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
            (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // f32 -> pred
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> pred
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  // f16 -> i32
  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;
  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
        Requires<[useFP16Math]>;
  // The constant on the left-hand side has already been moved into a
  // register by LOAD_CONST_F16, so the rr form is the one whose operand
  // list matches; the ir form would expect an f16imm as its first operand.
  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
        Requires<[useFP16Math, doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
            (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
        Requires<[useFP16Math]>;

  // f32 -> i32
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}
1840
// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons use the *U compare modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Comparisons without an explicit ordered/unordered prefix reuse the same
// modes as the ordered set above.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN compare modes.
defm FSetNUM : FSET_FORMAT<seto,  CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
1864
1865// FIXME: What is this doing here?  Can it be deleted?
1866// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
1867//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1868
// Type profiles for the NVPTXISD parameter-passing and call nodes.  Each
// SDTypeProfile gives <number of results, number of operands, constraints>.

// Parameter / return-slot declarations: three integer operands.
def SDTDeclareParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;

// Parameter loads producing 1, 2 or 4 results from two integer operands.
def SDTLoadParamProfile
    : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile
    : SDTypeProfile<2, 2,
                    [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile
    : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;

// Call printing: a single integer operand.
def SDTPrintCallProfile    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

// Parameter stores: the first two operands are integers.
def SDTStoreParamProfile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile
    : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile
    : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile
    : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;

// Call argument, call and parameter-move bookkeeping.
def SDTCallArgProfile     : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile    : SDTypeProfile<0, 1, []>;
def SDTCallValProfile     : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile   : SDTypeProfile<1, 1, []>;

// Return-value stores: the first operand is an integer.
def SDTStoreRetvalProfile   : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;

def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
1891
// SDNode wrappers for the NVPTXISD call-lowering opcodes.  Most nodes are
// chained, glued on both sides and flagged as having side effects; the
// exceptions are noted on each group.

// Declarations of outgoing parameters and return slots.
def DeclareParam : SDNode<
    "NVPTXISD::DeclareParam", SDTDeclareParamProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam : SDNode<
    "NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam : SDNode<
    "NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet : SDNode<
    "NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter loads: marked SDNPMayLoad rather than SDNPSideEffect.
def LoadParam : SDNode<
    "NVPTXISD::LoadParam", SDTLoadParamProfile,
    [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2 : SDNode<
    "NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
    [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4 : SDNode<
    "NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
    [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;

// Call printing, in regular / convergent and plain / .uni flavors.
def PrintCall : SDNode<
    "NVPTXISD::PrintCall", SDTPrintCallProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall : SDNode<
    "NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni : SDNode<
    "NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni : SDNode<
    "NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Parameter stores.
def StoreParam : SDNode<
    "NVPTXISD::StoreParam", SDTStoreParamProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2 : SDNode<
    "NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4 : SDNode<
    "NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32 : SDNode<
    "NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32 : SDNode<
    "NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Call argument list, call target and prototype.
def CallArgBegin : SDNode<
    "NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg : SDNode<
    "NVPTXISD::CallArg", SDTCallArgProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg : SDNode<
    "NVPTXISD::LastCallArg", SDTCallArgProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd : SDNode<
    "NVPTXISD::CallArgEnd", SDTCallVoidProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid : SDNode<
    "NVPTXISD::CallVoid", SDTCallVoidProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype : SDNode<
    "NVPTXISD::Prototype", SDTCallVoidProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal : SDNode<
    "NVPTXISD::CallVal", SDTCallValProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// MoveParam carries no node properties at all.
def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Return-value stores: chained with side effects, but not glued.
def StoreRetval : SDNode<
    "NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
    [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2 : SDNode<
    "NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
    [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4 : SDNode<
    "NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
    [SDNPHasChain, SDNPSideEffect]>;

def PseudoUseParam : SDNode<
    "NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
    [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Return: chained with side effects, but not glued.
def RETURNNode : SDNode<
    "NVPTXISD::RETURN", SDTCallArgMarkProfile,
    [SDNPHasChain, SDNPSideEffect]>;
1978
// ld.param forms reading from "[retval0+$b]".  opstr is the type suffix
// appended to the mnemonic; the V2/V4 variants load 2 or 4 registers of the
// same class in one instruction.  None of them carries a pattern.
let mayLoad = 1 in {
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst),
                (ins i32imm:$b),
                !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
                []>;

  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2),
                (ins i32imm:$b),
                !strconcat("ld.param.v2", opstr,
                           " \t{{$dst, $dst2}}, [retval0+$b];"),
                []>;

  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs regclass:$dst, regclass:$dst2,
                      regclass:$dst3, regclass:$dst4),
                (ins i32imm:$b),
                !strconcat("ld.param.v4", opstr,
                           " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
                []>;
}
1998
// Register-based counterpart of LoadParamMemInst: a plain mov from
// "retval<b>".  It matches LoadParam with a zero first operand.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins i32imm:$b),
              !strconcat("mov", opstr, " \t$dst, retval$b;"),
              [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2003
// st.param forms.  The StoreParam* classes write to "[param<a>+<b>]" and
// the StoreRetval* classes write to "[func_retval0+<a>]".  opstr is the type
// suffix; the V2/V4 variants store 2 or 4 registers of the same class.
// None of them carries a pattern.
let mayStore = 1 in {
  class StoreParamInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, i32imm:$a, i32imm:$b),
                !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
                []>;

  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                !strconcat("st.param.v2", opstr,
                           " \t[param$a+$b], {{$val, $val2}};"),
                []>;

  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2,
                     regclass:$val3, regclass:$val4,
                     i32imm:$a, i32imm:$b),
                !strconcat("st.param.v4", opstr,
                           " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
                []>;

  class StoreRetvalInst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, i32imm:$a),
                !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
                []>;

  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a),
                !strconcat("st.param.v2", opstr,
                           " \t[func_retval0+$a], {{$val, $val2}};"),
                []>;

  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr>
    : NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2,
                     regclass:$val3, regclass:$val4, i32imm:$a),
                !strconcat("st.param.v4", opstr,
                           " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
                []>;
}
2044
// Pseudo-instructions that print the start of a call.  OpcStr is the
// mnemonic and OpNode is the SDNode to match.  The node's i32 operand
// (0 through 8) picks the variant, which prints that many retvalN names in
// parentheses after the mnemonic (none for 0).
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr, " "),
        [(OpNode (i32 0))]>;
    def PrintCallRetInst1 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr, " (retval0), "),
        [(OpNode (i32 1))]>;
    def PrintCallRetInst2 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr, " (retval0, retval1), "),
        [(OpNode (i32 2))]>;
    def PrintCallRetInst3 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr, " (retval0, retval1, retval2), "),
        [(OpNode (i32 3))]>;
    def PrintCallRetInst4 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
        [(OpNode (i32 4))]>;
    def PrintCallRetInst5 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr,
                   " (retval0, retval1, retval2, retval3, retval4), "),
        [(OpNode (i32 5))]>;
    def PrintCallRetInst6 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr,
                   " (retval0, retval1, retval2, retval3, retval4, "
                   "retval5), "),
        [(OpNode (i32 6))]>;
    def PrintCallRetInst7 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr,
                   " (retval0, retval1, retval2, retval3, retval4, "
                   "retval5, retval6), "),
        [(OpNode (i32 7))]>;
    def PrintCallRetInst8 : NVPTXInst<
        (outs), (ins),
        !strconcat(OpcStr,
                   " (retval0, retval1, retval2, retval3, retval4, "
                   "retval5, retval6, retval7), "),
        [(OpNode (i32 8))]>;
  }
}
2075
// Regular call printers: plain "call" and "call.uni".
defm Call    : CALL<"call",     PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent call instructions.  They print the same text as the regular
// calls; the only difference is that isConvergent is set on them.
let isConvergent = 1 in {
  defm ConvergentCall    : CALL<"call",     PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
2085
// Parameter-load records.  Each one instantiates LoadParamMemInst, or its V2 /
// V4 variant, with a register class and the type-suffix string for it.

// Integer scalars.  The I8 record uses Int16Regs with the ".b8" suffix.
def LoadParamMemI64     : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32     : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16     : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8      : LoadParamMemInst<Int16Regs, ".b8">;

// Integer vectors: V2 for every width, V4 only up to 32-bit elements.
def LoadParamMemV2I64   : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32   : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16   : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8    : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32   : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16   : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8    : LoadParamV4MemInst<Int16Regs, ".b8">;

// Floating-point scalars.  f16 and f16x2 use the untyped ".b16" / ".b32"
// suffixes; f32 and f64 use ".f32" / ".f64".
def LoadParamMemF16     : LoadParamMemInst<Float16Regs, ".b16">;
def LoadParamMemF16x2   : LoadParamMemInst<Float16x2Regs, ".b32">;
def LoadParamMemF32     : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64     : LoadParamMemInst<Float64Regs, ".f64">;

// Floating-point vectors.
def LoadParamMemV2F16   : LoadParamV2MemInst<Float16Regs, ".b16">;
def LoadParamMemV2F16x2 : LoadParamV2MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV2F32   : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64   : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F16   : LoadParamV4MemInst<Float16Regs, ".b16">;
def LoadParamMemV4F16x2 : LoadParamV4MemInst<Float16x2Regs, ".b32">;
def LoadParamMemV4F32   : LoadParamV4MemInst<Float32Regs, ".f32">;
2108
// Parameter-store records.  Each one instantiates StoreParamInst, or its V2 /
// V4 variant, with a register class and the type-suffix string for it.

// Integer scalars.  The I8 record uses Int16Regs with the ".b8" suffix.
def StoreParamI64     : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32     : StoreParamInst<Int32Regs, ".b32">;
def StoreParamI16     : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8      : StoreParamInst<Int16Regs, ".b8">;

// Two-element integer vectors.
def StoreParamV2I64   : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32   : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16   : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8    : StoreParamV2Inst<Int16Regs, ".b8">;

// Four-element integer vectors (no 64-bit element form).
def StoreParamV4I32   : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16   : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8    : StoreParamV4Inst<Int16Regs, ".b8">;

// Floating-point scalars.  f16 and f16x2 use the untyped ".b16" / ".b32"
// suffixes; f32 and f64 use ".f32" / ".f64".
def StoreParamF16     : StoreParamInst<Float16Regs, ".b16">;
def StoreParamF16x2   : StoreParamInst<Float16x2Regs, ".b32">;
def StoreParamF32     : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64     : StoreParamInst<Float64Regs, ".f64">;

// Floating-point vectors.
def StoreParamV2F16   : StoreParamV2Inst<Float16Regs, ".b16">;
def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
def StoreParamV2F32   : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64   : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F16   : StoreParamV4Inst<Float16Regs, ".b16">;
def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
def StoreParamV4F32   : StoreParamV4Inst<Float32Regs, ".f32">;
2134
// Return-value store records.  Each one instantiates StoreRetvalInst, or its
// V2 / V4 variant, with a register class and the type-suffix string for it.

// Integer scalars.  The I8 record uses Int16Regs with the ".b8" suffix.
def StoreRetvalI64     : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32     : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16     : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8      : StoreRetvalInst<Int16Regs, ".b8">;

// Integer vectors: V2 for every width, V4 only up to 32-bit elements.
def StoreRetvalV2I64   : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32   : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16   : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8    : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32   : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16   : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8    : StoreRetvalV4Inst<Int16Regs, ".b8">;

// Floating-point scalars.  f16 and f16x2 use the untyped ".b16" / ".b32"
// suffixes; f32 and f64 use ".f32" / ".f64".
def StoreRetvalF64     : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32     : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalF16     : StoreRetvalInst<Float16Regs, ".b16">;
def StoreRetvalF16x2   : StoreRetvalInst<Float16x2Regs, ".b32">;

// Floating-point vectors.
def StoreRetvalV2F64   : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32   : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV2F16   : StoreRetvalV2Inst<Float16Regs, ".b16">;
def StoreRetvalV2F16x2 : StoreRetvalV2Inst<Float16x2Regs, ".b32">;
def StoreRetvalV4F32   : StoreRetvalV4Inst<Float32Regs, ".f32">;
def StoreRetvalV4F16   : StoreRetvalV4Inst<Float16Regs, ".b16">;
def StoreRetvalV4F16x2 : StoreRetvalV4Inst<Float16x2Regs, ".b32">;
2158
// Punctuation around a call's argument list, plus the return instruction.
// CallArgEnd is matched with an i32 flag: 1 prints ");" and 0 prints ")"
// without the terminating semicolon.
def CallArgBeginInst :
  NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1 :
  NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0 :
  NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
def RETURNInst :
  NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2163
// A register call argument that is followed by more arguments: prints the
// register and a trailing ", ".  Matches CallArg with a leading (i32 0).
class CallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a, ",
              [(CallArg (i32 0), regclass:$a)]>;
2167
// The final register argument of a call: prints only the register, with no
// separator after it.  Matches LastCallArg with a leading (i32 0).
class LastCallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$a),
              "$a",
              [(LastCallArg (i32 0), regclass:$a)]>;
2171
// Register call arguments, one record per register class.

// Non-final arguments.
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

// Final argument.
def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2183
// Immediate call arguments.  With a leading (i32 0) the immediate itself is
// printed; the "Last" form omits the trailing ", ".
def CallArgI32imm :
  NVPTXInst<(outs), (ins i32imm:$a),
            "$a, ",
            [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm :
  NVPTXInst<(outs), (ins i32imm:$a),
            "$a",
            [(LastCallArg (i32 0), (i32 imm:$a))]>;

// With a leading (i32 1) the immediate is printed as the suffix of a
// "param<N>" name instead of as a bare number.
def CallArgParam :
  NVPTXInst<(outs), (ins i32imm:$a),
            "param$a, ",
            [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam :
  NVPTXInst<(outs), (ins i32imm:$a),
            "param$a",
            [(LastCallArg (i32 1), (i32 imm:$a))]>;
2193
// Call target operand, printed as "<target>, ".  The target is either a
// wrapped global address or an address held in a 32-bit or 64-bit register.
def CallVoidInst :
  NVPTXInst<(outs), (ins imem:$addr),
            "$addr, ",
            [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :
  NVPTXInst<(outs), (ins Int32Regs:$addr),
            "$addr, ",
            [(CallVoid Int32Regs:$addr)]>;
def CallVoidInstReg64 :
  NVPTXInst<(outs), (ins Int64Regs:$addr),
            "$addr, ",
            [(CallVoid Int64Regs:$addr)]>;

// Prints ", prototype_<val>;" for the immediate matched by Prototype.
def PrototypeInst :
  NVPTXInst<(outs), (ins i32imm:$val),
            ", prototype_$val;",
            [(Prototype (i32 imm:$val))]>;
2202
// Declarations of the return-value slot "retval<num>".

// Aligned byte array in .param space: ".param .align A .b8 retvalN[SIZE];".
def DeclareRetMemInst :
  NVPTXInst<(outs),
            (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align),
                              (i32 imm:$size),
                              (i32 imm:$num))]>;

// Scalar declarations.  The first DeclareRet operand picks the form that is
// printed: 1 gives a ".param" declaration, 2 gives a ".reg" declaration.
def DeclareRetScalarInst :
  NVPTXInst<(outs),
            (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst :
  NVPTXInst<(outs),
            (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
2215
// Declarations of the outgoing call parameter "param<a>".

// Aligned byte array in .param space: ".param .align A .b8 paramN[SIZE];".
def DeclareParamInst :
  NVPTXInst<(outs),
            (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align),
                           (i32 imm:$a),
                           (i32 imm:$size))]>;

// Scalar declarations.  The last DeclareScalarParam operand picks the form
// that is printed: 0 gives a ".param" declaration, 1 gives a ".reg" one.
def DeclareScalarParamInst :
  NVPTXInst<(outs),
            (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :
  NVPTXInst<(outs),
            (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2228
// Register-to-register move selected for MoveParam.  asmstr is the type
// suffix appended to "mov" (for example ".b32"); source and destination share
// the same register class.
class MoveParamInst<NVPTXRegClass regclass, string asmstr>
  : NVPTXInst<(outs regclass:$dst),
              (ins regclass:$src),
              !strconcat(!strconcat("mov", asmstr), " \t$dst, $src;"),
              [(set regclass:$dst, (MoveParam regclass:$src))]>;
2233
// MoveParam records, one per register class.
def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;

// The 16-bit integer form is spelled out by hand because it prints a cvt
// rather than a mov.
// NOTE(review): $src is an Int16Regs operand but the mnemonic names a .u32
// source type -- confirm this is intended.
def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "cvt.u16.u32 \t$dst, $src;",
            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;

def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;

// PTX mov has no .f16 type, so the half-precision move uses the untyped
// 16-bit form, matching every other f16 record in this file (".b16").
def MoveParamF16 : MoveParamInst<Float16Regs, ".b16">;
2243
// Selected for PseudoUseParam.  It has no outputs and its assembly text is
// only a comment naming the register, so nothing executable is printed.
class PseudoUseParamInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs),
              (ins regclass:$src),
              "// Pseudo use of $src",
              [(PseudoUseParam regclass:$src)]>;
2248
// PseudoUseParam records: three integer widths, then the two float widths.
def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;

def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
2254
2255
//
// Load / Store Handling
//

// Scalar loads into one register of class regclass.
//
// All six records share the modifier operands ($isVol, $addsp, $Vec, $Sign,
// $fromWidth), which are printed as part of the mnemonic.  They differ only
// in the address operand(s):
//   _avar              imem address
//   _areg / _areg_64   address in a 32-bit / 64-bit register
//   _ari  / _ari_64    32-bit / 64-bit register plus immediate offset
//   _asi               imem address plus immediate offset
// The pattern lists are empty, so these are not selected by the
// auto-generated matcher.
multiclass LD<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t$dst, [$addr];",
    []>;
  def _areg : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t$dst, [$addr];",
    []>;
  def _areg_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t$dst, [$addr];",
    []>;
  def _ari : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t$dst, [$addr+$offset];",
    []>;
  def _ari_64 : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t$dst, [$addr+$offset];",
    []>;
  def _asi : NVPTXInst<
    (outs regclass:$dst),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t$dst, [$addr+$offset];",
    []>;
}
2297
// Scalar load records per value type.  i8 and i16 both use Int16Regs.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LD_i8    : LD<Int16Regs>;
  defm LD_i16   : LD<Int16Regs>;
  defm LD_i32   : LD<Int32Regs>;
  defm LD_i64   : LD<Int64Regs>;
  defm LD_f16   : LD<Float16Regs>;
  defm LD_f16x2 : LD<Float16x2Regs>;
  defm LD_f32   : LD<Float32Regs>;
  defm LD_f64   : LD<Float64Regs>;
}
2308
// Scalar stores of one register of class regclass.
//
// The stored value $src comes first, followed by the modifier operands
// ($isVol, $addsp, $Vec, $Sign, $toWidth) that are printed as part of the
// mnemonic.  The records differ only in the address operand(s):
//   _avar              imem address
//   _areg / _areg_64   address in a 32-bit / 64-bit register
//   _ari  / _ari_64    32-bit / 64-bit register plus immediate offset
//   _asi               imem address plus immediate offset
// The pattern lists are empty, so these are not selected by the
// auto-generated matcher.
multiclass ST<NVPTXRegClass regclass> {
  def _avar : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$toWidth \t[$addr], $src;",
    []>;
  def _areg : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$toWidth \t[$addr], $src;",
    []>;
  def _areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$toWidth \t[$addr], $src;",
    []>;
  def _ari : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$toWidth \t[$addr+$offset], $src;",
    []>;
  def _ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$toWidth \t[$addr+$offset], $src;",
    []>;
  def _asi : NVPTXInst<
    (outs),
    (ins regclass:$src,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$toWidth,
         imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$toWidth \t[$addr+$offset], $src;",
    []>;
}
2347
// Scalar store records per value type.  i8 and i16 both use Int16Regs.
let mayStore = 1, hasSideEffects = 0 in {
  defm ST_i8    : ST<Int16Regs>;
  defm ST_i16   : ST<Int16Regs>;
  defm ST_i32   : ST<Int32Regs>;
  defm ST_i64   : ST<Int64Regs>;
  defm ST_f16   : ST<Float16Regs>;
  defm ST_f16x2 : ST<Float16x2Regs>;
  defm ST_f32   : ST<Float32Regs>;
  defm ST_f64   : ST<Float64Regs>;
}
2358
// Vector loads.  These are used only during and after vector elementization,
// which runs at the machine-instruction level, so they never appear in the
// DAG (hence the empty pattern lists).
//
// _v2_* records define two destination registers and _v4_* records define
// four; the destinations are printed as a brace-enclosed list.  The suffix
// after the element count names the address form:
//   avar              imem address
//   areg / areg_64    address in a 32-bit / 64-bit register
//   ari  / ari_64     32-bit / 64-bit register plus immediate offset
//   asi               imem address plus immediate offset
multiclass LD_VEC<NVPTXRegClass regclass> {
  // ---- two-element forms ----
  def _v2_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2}}, [$addr];",
    []>;
  def _v2_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2}}, [$addr];",
    []>;
  def _v2_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2}}, [$addr];",
    []>;
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];",
    []>;
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];",
    []>;
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];",
    []>;

  // ---- four-element forms ----
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];",
    []>;
  def _v4_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];",
    []>;
  def _v4_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];",
    []>;
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];",
    []>;
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];",
    []>;
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];",
    []>;
}
// Vector load records per element type.  i8 and i16 both use Int16Regs.
let mayLoad = 1, hasSideEffects = 0 in {
  defm LDV_i8    : LD_VEC<Int16Regs>;
  defm LDV_i16   : LD_VEC<Int16Regs>;
  defm LDV_i32   : LD_VEC<Int32Regs>;
  defm LDV_i64   : LD_VEC<Int64Regs>;
  defm LDV_f16   : LD_VEC<Float16Regs>;
  defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
  defm LDV_f32   : LD_VEC<Float32Regs>;
  defm LDV_f64   : LD_VEC<Float64Regs>;
}
2446
// Vector stores.  _v2_* records take two source registers and _v4_* records
// take four; the sources are printed as a brace-enclosed list after the
// address.  The pattern lists are empty, so these are not selected by the
// auto-generated matcher.
//
// The suffix after the element count names the address form:
//   avar              imem address
//   areg / areg_64    address in a 32-bit / 64-bit register
//   ari  / ari_64     32-bit / 64-bit register plus immediate offset
//   asi               imem address plus immediate offset
//
// Note that the width operand is named $fromWidth here, whereas the scalar
// ST multiclass calls it $toWidth.
multiclass ST_VEC<NVPTXRegClass regclass> {
  // ---- two-element forms ----
  def _v2_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr], {{$src1, $src2}};",
    []>;
  def _v2_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr], {{$src1, $src2}};",
    []>;
  def _v2_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr], {{$src1, $src2}};",
    []>;
  def _v2_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2}};",
    []>;
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2}};",
    []>;
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2}};",
    []>;

  // ---- four-element forms ----
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int32Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         Int64Regs:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};",
    []>;
  def _v4_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$isVol, LdStCode:$addsp,
         LdStCode:$Vec, LdStCode:$Sign,
         i32imm:$fromWidth,
         imem:$addr, i32imm:$offset),
    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};",
    []>;
}
2530
// Vector store records per element type.  i8 and i16 both use Int16Regs.
let mayStore = 1, hasSideEffects = 0 in {
  defm STV_i8    : ST_VEC<Int16Regs>;
  defm STV_i16   : ST_VEC<Int16Regs>;
  defm STV_i32   : ST_VEC<Int32Regs>;
  defm STV_i64   : ST_VEC<Int64Regs>;
  defm STV_f16   : ST_VEC<Float16Regs>;
  defm STV_f16x2 : ST_VEC<Float16x2Regs>;
  defm STV_f32   : ST_VEC<Float32Regs>;
  defm STV_f64   : ST_VEC<Float64Regs>;
}
2541
//---- Conversion ----

// Bit-preserving move between two register classes of the same width,
// selected for bitconvert.  SzStr is the width in bits; it is appended to
// "mov.b" to form the mnemonic (for example "mov.b32").
class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
                   NVPTXRegClass regclassOut>
  : NVPTXInst<(outs regclassOut:$d),
              (ins regclassIn:$a),
              !strconcat("mov.b", SzStr, " \t$d, $a;"),
              [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
2549
// Bitconvert records: an integer <-> float pair for each of 16, 32 and 64
// bits, then the pair between i32 and the packed f16x2 class.
def BITCONVERT_16_I2F     : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
def BITCONVERT_16_F2I     : F_BITCONVERT<"16", Float16Regs, Int16Regs>;

def BITCONVERT_32_I2F     : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
def BITCONVERT_32_F2I     : F_BITCONVERT<"32", Float32Regs, Int32Regs>;

def BITCONVERT_64_I2F     : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
def BITCONVERT_64_F2I     : F_BITCONVERT<"64", Float64Regs, Int64Regs>;

def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
2558
// Integer -> floating-point conversions.  All of them round to nearest
// (CvtRN).
//
// NOTE: the pred -> fp cases are currently sub-optimal due to an issue in
// TableGen where we cannot specify floating-point literals in isel patterns.
// Therefore an integer selp first materialises the integer value of the
// predicate and a cvt then turns that into floating point.
//
// The value selected for a true predicate depends on signedness: a signed i1
// "true" is -1, so sint_to_fp must select -1 (giving -1.0), while an unsigned
// i1 "true" is 1, so uint_to_fp selects 1 (giving 1.0).

// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
          (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
          (CVT_f16_s64 Int64Regs:$a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
          (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
          (CVT_f16_u64 Int64Regs:$a, CvtRN)>;

// sint -> f32
def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
2622
2623
// Floating-point -> integer conversions.
//
// i16 / i32 / i64 results use a cvt that rounds toward zero to an integer
// (CvtRZI).  For f16 and f32 sources every such pattern is listed twice: the
// first copy uses the _FTZ mode and is guarded by doF32FTZ, the second is the
// unguarded fallback.  f64 sources have no FTZ copy.
//
// i1 results bitcast the value to an integer of the same width and compare it
// against 0 with CmpEQ.
// NOTE(review): as written the predicate is true only when the bit pattern is
// all zeroes (+0.0) -- confirm this is the intended lowering for i1 results.

// f16 -> sint
def : Pat<
  (i1 (fp_to_sint Float16Regs:$a)),
  (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float16Regs:$a)),
  (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_sint Float16Regs:$a)),
  (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float16Regs:$a)),
  (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_sint Float16Regs:$a)),
  (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float16Regs:$a)),
  (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_sint Float16Regs:$a)),
  (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;

// f16 -> uint
def : Pat<
  (i1 (fp_to_uint Float16Regs:$a)),
  (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float16Regs:$a)),
  (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_uint Float16Regs:$a)),
  (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float16Regs:$a)),
  (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_uint Float16Regs:$a)),
  (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float16Regs:$a)),
  (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_uint Float16Regs:$a)),
  (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;

// f32 -> sint
def : Pat<
  (i1 (fp_to_sint Float32Regs:$a)),
  (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float32Regs:$a)),
  (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_sint Float32Regs:$a)),
  (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float32Regs:$a)),
  (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_sint Float32Regs:$a)),
  (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float32Regs:$a)),
  (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_sint Float32Regs:$a)),
  (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;

// f32 -> uint
def : Pat<
  (i1 (fp_to_uint Float32Regs:$a)),
  (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float32Regs:$a)),
  (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i16 (fp_to_uint Float32Regs:$a)),
  (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float32Regs:$a)),
  (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i32 (fp_to_uint Float32Regs:$a)),
  (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float32Regs:$a)),
  (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<
  (i64 (fp_to_uint Float32Regs:$a)),
  (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;

// f64 -> sint
def : Pat<
  (i1 (fp_to_sint Float64Regs:$a)),
  (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_sint Float64Regs:$a)),
  (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_sint Float64Regs:$a)),
  (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_sint Float64Regs:$a)),
  (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;

// f64 -> uint
def : Pat<
  (i1 (fp_to_uint Float64Regs:$a)),
  (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
def : Pat<
  (i16 (fp_to_uint Float64Regs:$a)),
  (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i32 (fp_to_uint Float64Regs:$a)),
  (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
def : Pat<
  (i64 (fp_to_uint Float64Regs:$a)),
  (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
2707
// Extensions of an i1 predicate are done with an immediate selp: the first
// immediate is the result for a true predicate, the second for a false one.

// sext i1: true becomes all-ones (-1).
def : Pat<
  (i16 (sext Int1Regs:$a)),
  (SELP_s16ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i32 (sext Int1Regs:$a)),
  (SELP_s32ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i64 (sext Int1Regs:$a)),
  (SELP_s64ii -1, 0, Int1Regs:$a)>;

// zext i1: true becomes 1.
def : Pat<
  (i16 (zext Int1Regs:$a)),
  (SELP_u16ii 1, 0, Int1Regs:$a)>;
def : Pat<
  (i32 (zext Int1Regs:$a)),
  (SELP_u32ii 1, 0, Int1Regs:$a)>;
def : Pat<
  (i64 (zext Int1Regs:$a)),
  (SELP_u64ii 1, 0, Int1Regs:$a)>;

// anyext i1: uses the unsigned selp with -1 for true.
def : Pat<
  (i16 (anyext Int1Regs:$a)),
  (SELP_u16ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i32 (anyext Int1Regs:$a)),
  (SELP_u32ii -1, 0, Int1Regs:$a)>;
def : Pat<
  (i64 (anyext Int1Regs:$a)),
  (SELP_u64ii -1, 0, Int1Regs:$a)>;
2731
// Integer widening from i16 and i32.  Every pattern is a single cvt with the
// CvtNONE mode: sext uses the signed CVT_s*_s* forms, while zext and anyext
// both use the unsigned CVT_u*_u* forms.
2732// sext i16
2733def : Pat<(i32 (sext Int16Regs:$a)),
2734          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
2735def : Pat<(i64 (sext Int16Regs:$a)),
2736          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
2737
2738// zext i16
2739def : Pat<(i32 (zext Int16Regs:$a)),
2740          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
2741def : Pat<(i64 (zext Int16Regs:$a)),
2742          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
2743
2744// anyext i16
2745def : Pat<(i32 (anyext Int16Regs:$a)),
2746          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
2747def : Pat<(i64 (anyext Int16Regs:$a)),
2748          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
2749
2750// sext i32
2751def : Pat<(i64 (sext Int32Regs:$a)),
2752          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
2753
2754// zext i32
2755def : Pat<(i64 (zext Int32Regs:$a)),
2756          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
2757
2758// anyext i32
2759def : Pat<(i64 (anyext Int32Regs:$a)),
2760          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
2761
2762
// Integer truncation.  Narrowing to i16/i32 is a single unsigned cvt with
// CvtNONE.  Narrowing to i1 cannot use cvt: the low bit is isolated with an
// AND against 1 and the result is compared for equality with 1 to produce the
// predicate.
2763// truncate i64
2764def : Pat<(i32 (trunc Int64Regs:$a)),
2765          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
2766def : Pat<(i16 (trunc Int64Regs:$a)),
2767          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
2768def : Pat<(i1 (trunc Int64Regs:$a)),
2769          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
2770
2771// truncate i32
2772def : Pat<(i16 (trunc Int32Regs:$a)),
2773          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
2774def : Pat<(i1 (trunc Int32Regs:$a)),
2775          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
2776
2777// truncate i16
2778def : Pat<(i1 (trunc Int16Regs:$a)),
2779          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
2780
2781// sext_inreg
// One pattern per (register width, source width) pair.  Each maps to the
// CVT_INREG_s<dst>_s<src> instruction, which takes only the register operand
// (no conversion-mode operand).
2782def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
2783def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
2784def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
2785def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
2786def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
2787def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
2788
2789
2790// Select instructions with 32-bit predicates
// The i32 condition is first reduced to a predicate the same way an
// i32 -> i1 truncate is: AND with 1, then compare equal to 1.  That predicate
// drives the register-register SELP of the matching value type.
2791def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
2792          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
2793          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2794def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
2795          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
2796          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2797def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
2798          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
2799          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2800def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
2801          (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
2802          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2803def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
2804          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
2805          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2806def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
2807          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
2808          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2809
2810
// Register pack/unpack moves.
//
// Every instruction here is a single mov.b32/mov.b64 between one wide register
// and a brace-enclosed list of narrower registers.  None of them has a
// selection pattern (the pattern list is empty), and all are marked as having
// no side effects.
let hasSideEffects = 0 in {
  // Pack: several narrow source registers -> one wide destination register.
  def V4I16toI64
      : NVPTXInst<(outs Int64Regs:$d),
                  (ins Int16Regs:$s1, Int16Regs:$s2,
                       Int16Regs:$s3, Int16Regs:$s4),
                  "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};",
                  []>;
  def V2I16toI32
      : NVPTXInst<(outs Int32Regs:$d),
                  (ins Int16Regs:$s1, Int16Regs:$s2),
                  "mov.b32 \t$d, {{$s1, $s2}};",
                  []>;
  def V2I32toI64
      : NVPTXInst<(outs Int64Regs:$d),
                  (ins Int32Regs:$s1, Int32Regs:$s2),
                  "mov.b64 \t$d, {{$s1, $s2}};",
                  []>;
  def V2F32toF64
      : NVPTXInst<(outs Float64Regs:$d),
                  (ins Float32Regs:$s1, Float32Regs:$s2),
                  "mov.b64 \t$d, {{$s1, $s2}};",
                  []>;

  // Unpack: one wide source register -> several narrow destination registers.
  def I64toV4I16
      : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
                        Int16Regs:$d3, Int16Regs:$d4),
                  (ins Int64Regs:$s),
                  "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;",
                  []>;
  def I32toV2I16
      : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
                  (ins Int32Regs:$s),
                  "mov.b32 \t{{$d1, $d2}}, $s;",
                  []>;
  def I64toV2I32
      : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
                  (ins Int64Regs:$s),
                  "mov.b64 \t{{$d1, $d2}}, $s;",
                  []>;
  def F64toV2F32
      : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                  (ins Float64Regs:$s),
                  "mov.b64 \t{{$d1, $d2}}, $s;",
                  []>;
}
2843
// Helpers for moving between f16x2 registers and their f16 halves.  All are
// marked as having no side effects.
2844let hasSideEffects = 0 in {
2845  // Extract element of f16x2 register. PTX does not provide any way
2846  // to access elements of f16x2 vector directly, so we need to
2847  // extract it using a temporary register.
  // Element 0 is the first register of the unpacking mov; the other half is
  // written to a scratch .b16 register declared inside the emitted scope.
2848  def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
2849                               (ins Float16x2Regs:$src),
2850                               "{{ .reg .b16 \t%tmp_hi;\n\t"
2851                               "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
2852                               [(set Float16Regs:$dst,
2853                                 (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
  // Element 1 is the second register of the unpacking mov.
2854  def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
2855                               (ins Float16x2Regs:$src),
2856                               "{{ .reg .b16 \t%tmp_lo;\n\t"
2857                               "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
2858                               [(set Float16Regs:$dst,
2859                                 (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
2860
2861  // Coalesce two f16 registers into f16x2
2862  def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
2863                             (ins Float16Regs:$a, Float16Regs:$b),
2864                             "mov.b32 \t$dst, {{$a, $b}};",
2865                             [(set Float16x2Regs:$dst,
2866                               (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
2867
2868  // Directly initializing the underlying b32 register is one less SASS
2869  // instruction than a vector-packing move.
  // No selection pattern is attached to this form.
2870  def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
2871                              "mov.b32 \t$dst, $src;",
2872                              []>;
2873
2874  // Split f16x2 into two f16 registers.
2875  def SplitF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
2876                              (ins Float16x2Regs:$src),
2877                              "mov.b32 \t{{$lo, $hi}}, $src;",
2878                              []>;
2879  // Split an i32 into two f16
2880  def SplitI32toF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
2881                                   (ins Int32Regs:$src),
2882                                   "mov.b32 \t{{$lo, $hi}}, $src;",
2883                                   []>;
2884}
2885
// Count leading zeros.
//
// Both variants write their result to a 32-bit register, including the one
// that reads a 64-bit source.  Neither has a selection pattern of its own.
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                       "clz.b32 \t$d, $a;", []> {
  let hasSideEffects = 0;
}
def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                       "clz.b64 \t$d, $a;", []> {
  let hasSideEffects = 0;
}
2893
2894// 32-bit has a direct PTX instruction
2895def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
2896
2897// The return type of the ctlz ISD node is the same as its input, but the PTX
2898// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
2899// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
2900// truncating back down to 32 bits.
2901def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
2902def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
2903
2904// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
2905// result back to 16-bits if necessary.  We also need to subtract 16 because
2906// the high-order 16 zeros were counted.
2907//
2908// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
2909// use to save one SASS instruction (on sm_35 anyway):
2910//
2911//   mov.b32 $tmp, {0xffff, $a}
2912//   clz.b32 $result, $tmp
2913//
2914// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
2915// and then ctlz that value.  This way we don't have to subtract 16 from the
2916// result.  Unfortunately today we don't have a way to generate
2917// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
2918def : Pat<(ctlz Int16Regs:$a),
2919          (SUBi16ri (CVT_u16_u32
2920           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
// When the 16-bit count is immediately zero-extended to i32, the subtraction
// is done in 32 bits and the narrowing cvt is skipped.
2921def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
2922          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
2923
// Population count.
//
// As with clz, the destination is a 32-bit register for both the 32-bit and
// the 64-bit source forms.  Neither has a selection pattern of its own.
def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                        "popc.b32 \t$d, $a;", []> {
  let hasSideEffects = 0;
}
def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                        "popc.b64 \t$d, $a;", []> {
  let hasSideEffects = 0;
}
2931
2932// 32-bit has a direct PTX instruction
2933def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
2934
2935// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
2936// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
2937// pattern that avoids the type conversion if we're truncating the result to
2938// i32 anyway.
2939def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
2940def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
2941
2942// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
2943// If we know that we're storing into an i32, we can avoid the final trunc.
// Unlike the 16-bit ctlz lowering, no constant is subtracted afterwards.
2944def : Pat<(ctpop Int16Regs:$a),
2945          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
2946def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
2947          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
2948
// Floating-point narrowing (fpround) and widening (fpextend).
//
// Each conversion lowers to a single cvt.  Narrowing uses round-to-nearest
// (CvtRN); widening needs no rounding (CvtNONE).  PTX accepts the .ftz
// modifier on cvt only when the source or the destination type is .f32, so
// only conversions that involve f32 get an additional *_FTZ pattern predicated
// on doF32FTZ.  The predicated pattern is listed before its unpredicated
// fallback.  Conversions directly between f16 and f64 always use the plain
// mode.

// fpround f32 -> f16
def : Pat<(f16 (fpround Float32Regs:$a)),
          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f16 (fpround Float32Regs:$a)),
          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

// fpround f64 -> f16
// No FTZ form: neither type is f32, so cvt.rn.ftz.f16.f64 is not valid PTX.
def : Pat<(f16 (fpround Float64Regs:$a)),
          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),
          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// fpextend f16 -> f32
def : Pat<(f32 (fpextend Float16Regs:$a)),
          (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend Float16Regs:$a)),
          (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64
// No FTZ form: neither type is f32, so cvt.ftz.f64.f16 is not valid PTX.
def : Pat<(f64 (fpextend Float16Regs:$a)),
          (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),
          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
2984
// SelectionDAG node for NVPTXISD::RET_FLAG.  It uses the SDTNone type profile
// and is chained, with an optional incoming glue operand.
2985def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
2986                     [SDNPHasChain, SDNPOptInGlue]>;
2987
// fceil, ffloor, fround, ftrunc.
//
// Each operation is a same-type cvt with an integer rounding mode:
//   fceil  -> CvtRPI      ffloor -> CvtRMI
//   fround -> CvtRNI      ftrunc -> CvtRZI
// f32 selects the *_FTZ mode under doF32FTZ and the plain mode under
// doNoF32FTZ.  f16 and f64 always use the plain mode, because PTX accepts the
// .ftz modifier on cvt only when the source or destination type is .f32.
//
// NOTE(review): cvt.rni rounds halfway cases to even, whereas ISD::FROUND
// (llvm.round) rounds them away from zero, so the fround patterns below give a
// different answer for inputs such as 2.5.  Correcting that needs a custom
// lowering rather than a pattern change -- confirm and track separately.

def : Pat<(fceil Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRPI)>;
def : Pat<(fceil Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(fceil Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
def : Pat<(fceil Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;

def : Pat<(ffloor Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRMI)>;
def : Pat<(ffloor Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ffloor Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ffloor Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;

def : Pat<(f16 (fround Float16Regs:$a)),
          (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
def : Pat<(fround Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fround Float32Regs:$a)),
          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(f64 (fround Float64Regs:$a)),
          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;

def : Pat<(ftrunc Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(ftrunc Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ftrunc Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ftrunc Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
3033
// nearbyint and rint are implemented as rounding to nearest even.  This isn't
// strictly correct, because it causes us to ignore the rounding mode.  But it
// matches what CUDA's "libm" does.
//
// Both operations map to a same-type cvt with CvtRNI.  f32 selects CvtRNI_FTZ
// under doF32FTZ and CvtRNI under doNoF32FTZ.  f16 and f64 always use the plain
// CvtRNI, because PTX accepts the .ftz modifier on cvt only when the source or
// destination type is .f32.

def : Pat<(fnearbyint Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
def : Pat<(fnearbyint Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(fnearbyint Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(fnearbyint Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;

def : Pat<(frint Float16Regs:$a),
          (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
def : Pat<(frint Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(frint Float32Regs:$a),
          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(frint Float64Regs:$a),
          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
3059
3060
3061//-----------------------------------
3062// Control-flow
3063//-----------------------------------
3064
// All four instructions are terminators.
3065let isTerminator=1 in {
   // Function return; selected from the retflag node.
3066   let isReturn=1, isBarrier=1 in
3067      def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
3068
   // Conditional branch taken when predicate $a is set.
3069   let isBranch=1 in
3070      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3071                              "@$a bra \t$target;",
3072                              [(brcond Int1Regs:$a, bb:$target)]>;
   // Conditional branch on the negated predicate ("@!$a").  It has no pattern
   // of its own in this definition.
3073   let isBranch=1 in
3074      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3075                                   "@!$a bra \t$target;", []>;
3076
   // Unconditional branch.
3077   let isBranch=1, isBarrier=1 in
3078      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
3079                           "bra.uni \t$target;", [(br bb:$target)]>;
3080}
3081
// A branch on an i32 condition first materializes a predicate by comparing the
// value against 0 with CmpNE, then uses the ordinary predicated branch.
3082def : Pat<(brcond Int32Regs:$a, bb:$target),
3083          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
3084
3085// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
3086// conditional branch if the target block is the next block so that the code
3087// can fall through to the target block.  The inversion is done by 'xor
3088// condition, 1', which will be translated to (setne condition, -1).  Since ptx
3089// supports '@!pred bra target', we should use it.
3090def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
3091          (CBranchOther Int1Regs:$a, bb:$target)>;
3092
3093// Call
// Type profiles for the call-sequence markers: both operands are i32.
3094def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
3095                                            SDTCisVT<1, i32>]>;
3096def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
3097
// Nodes for the generic ISD::CALLSEQ_START / ISD::CALLSEQ_END opcodes.  Both
// are chained, produce glue and are flagged as having side effects; only
// callseq_end optionally consumes glue.
3098def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
3099                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
3100def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
3101                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
3102                            SDNPSideEffect]>;
3103
// NVPTXISD::CALL takes a single i32 operand and produces no results.
3104def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
3105def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
3106                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
3107def calltarget : Operand<i32>;
// The instruction itself has an empty pattern list; the two anonymous patterns
// below select it for global-address and external-symbol targets.
3108let isCall=1 in {
3109   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
3110}
3111
3112def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
3113def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
3114
3115// Pseudo instructions.
// Pseudo forwards its four arguments unchanged to NVPTXInst.
// NOTE(review): the two definitions below derive from NVPTXInst directly
// rather than from this class -- confirm whether Pseudo is used elsewhere.
3116class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
3117   : NVPTXInst<outs, ins, asmstr, pattern>;
3118
// Emitted for callseq_start: prints an opening brace followed by a comment
// carrying both amounts, then declares the .b32 register temp_param_reg.
3119def Callseq_Start :
3120  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3121            "\\{ // callseq $amt1, $amt2\n"
3122            "\t.reg .b32 temp_param_reg;",
3123            [(callseq_start timm:$amt1, timm:$amt2)]>;
// Emitted for callseq_end: prints the matching closing brace.  Only $amt1
// appears in the trailing comment.
3124def Callseq_End :
3125  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3126            "\\} // callseq $amt1",
3127            [(callseq_end timm:$amt1, timm:$amt2)]>;
3128
3129// trap instruction
// Takes no operands and is selected directly from the generic trap node.
3130def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
3131
3132// Call prototype wrapper
// The node takes one integer operand and produces no results; it is chained,
// consumes and produces glue, and is flagged as having side effects.
3133def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3134def CallPrototype :
3135  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
3136         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// i32 operand printed through the custom printProtoIdent method.
3137def ProtoIdent : Operand<i32> {
3138  let PrintMethod = "printProtoIdent";
3139}
// The assembly text is just the printed operand; the matched operand is an
// external symbol.
3140def CALL_PROTOTYPE :
3141  NVPTXInst<(outs), (ins ProtoIdent:$ident),
3142            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3143
3144
3145include "NVPTXIntrinsics.td"
3146
3147
3148//-----------------------------------
3149// Notes
3150//-----------------------------------
3151// BSWAP is currently expanded. The following would be more efficient:
3152// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
3153// - for sm_20, use prmt (use vector scalar mov to get the pack and
3154//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
3155//   registers.
3156