• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the PTX instructions in TableGen format.
11//
12//===----------------------------------------------------------------------===//
13
14include "NVPTXInstrFormats.td"
15
// NOP: an instruction record with no outputs, no inputs, an empty assembly
// string and no selection pattern.  It is flagged as free of side effects.
let hasSideEffects = 0 in
  def NOP : NVPTXInst<(outs), (ins), "", []>;
20
// Vector-specific instruction properties.  Every record wraps one distinct
// VecInstTypeEnum value; 1-7 are assigned consecutively and isVecOther is 15.
def isVecLD : VecInstTypeEnum<1>;
def isVecST : VecInstTypeEnum<2>;
def isVecBuild : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert : VecInstTypeEnum<6>;
def isVecDest : VecInstTypeEnum<7>;
def isVecOther : VecInstTypeEnum<15>;
30
31//===----------------------------------------------------------------------===//
32// NVPTX Operand Definitions.
33//===----------------------------------------------------------------------===//
34
// Branch-target operand; its value type is OtherVT.
def brtarget : Operand<OtherVT>;
36
// CVT conversion modes, expressed as i32 immediate leaves.
// These must match the enum in NVPTX.h.
//
// Layout visible in the values below: the low nibble selects the base mode,
// each *_FTZ value is the base value with 0x10 added, CvtSAT is 0x20 and
// CvtSAT_FTZ is 0x20 + 0x10.

// Base modes.
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI : PatLeaf<(i32 0x1)>;
def CvtRZI : PatLeaf<(i32 0x2)>;
def CvtRMI : PatLeaf<(i32 0x3)>;
def CvtRPI : PatLeaf<(i32 0x4)>;
def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;

// Base modes combined with the FTZ flag.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ : PatLeaf<(i32 0x18)>;

// Saturating modes.
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
61
// i32 operand holding a conversion mode; it is printed by printCvtMode.
def CvtMode : Operand<i32> { let PrintMethod = "printCvtMode"; }
65
// Compare modes, expressed as i32 immediate leaves.
// These must match the enum in NVPTX.h.
//
// The base modes are numbered 0x0 through 0x11 (0 through 17); every *_FTZ
// value is the corresponding base value with 0x100 added.

// Base modes.
def CmpEQ : PatLeaf<(i32 0x0)>;
def CmpNE : PatLeaf<(i32 0x1)>;
def CmpLT : PatLeaf<(i32 0x2)>;
def CmpLE : PatLeaf<(i32 0x3)>;
def CmpGT : PatLeaf<(i32 0x4)>;
def CmpGE : PatLeaf<(i32 0x5)>;
def CmpLO : PatLeaf<(i32 0x6)>;
def CmpLS : PatLeaf<(i32 0x7)>;
def CmpHI : PatLeaf<(i32 0x8)>;
def CmpHS : PatLeaf<(i32 0x9)>;
def CmpEQU : PatLeaf<(i32 0xA)>;
def CmpNEU : PatLeaf<(i32 0xB)>;
def CmpLTU : PatLeaf<(i32 0xC)>;
def CmpLEU : PatLeaf<(i32 0xD)>;
def CmpGTU : PatLeaf<(i32 0xE)>;
def CmpGEU : PatLeaf<(i32 0xF)>;
def CmpNUM : PatLeaf<(i32 0x10)>;
def CmpNAN : PatLeaf<(i32 0x11)>;

// Base modes combined with the FTZ flag.
def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
def CmpLO_FTZ : PatLeaf<(i32 0x106)>;
def CmpLS_FTZ : PatLeaf<(i32 0x107)>;
def CmpHI_FTZ : PatLeaf<(i32 0x108)>;
def CmpHS_FTZ : PatLeaf<(i32 0x109)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
105
// i32 operand holding a compare mode; it is printed by printCmpMode.
def CmpMode : Operand<i32> { let PrintMethod = "printCmpMode"; }
109
// f32 immediate operands/leaves whose node transform rebuilds the matched
// fpimm as a fixed f32 target constant: 0.0 for F32ConstZero and 1.0 for
// F32ConstOne.
//
// The transform passes SDLoc(N) to getTargetConstantFP so that the call has
// the same (value, location, type) shape as the getTargetConstant calls made
// by the other SDNodeXForms in this file.
def F32ConstZero : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
    return CurDAG->getTargetConstantFP(0.0, SDLoc(N), MVT::f32);
  }]>;
def F32ConstOne : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
    return CurDAG->getTargetConstantFP(1.0, SDLoc(N), MVT::f32);
  }]>;
116
117//===----------------------------------------------------------------------===//
118// NVPTX Instruction Predicate Definitions
119//===----------------------------------------------------------------------===//
120
121
// Each Predicate wraps a C++ boolean expression (given as a string) that
// guards the instructions and patterns listing it in Requires<[...]>.

// Subtarget feature queries: 32-bit atomics/reductions.
def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
// True when the generic 32-bit form is unavailable but the G32 form exists.
def useAtomRedG32forGen32
    : Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;

def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;

// Subtarget feature queries: 64-bit atomics/reductions.
def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
// True when the generic 64-bit form is unavailable but the G64 form exists.
def useAtomRedG64forGen64
    : Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;

// Remaining subtarget feature queries.
def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;

// f32 flush-to-zero on / off.
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;

// Note: this condition is the bare name doMulWide, not a call.
def doMulWide : Predicate<"doMulWide">;

// FMA allowed / not allowed.
def allowFMA : Predicate<"allowFMA()">;
def noFMA : Predicate<"!allowFMA()">;

// f32 division level: 0 selects the approximate forms, 1 the full forms.
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;

// f32 square root: approximate unless usePrecSqrtF32() holds.
def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;

// Hardware 32-bit rotate present / absent.
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;

// Always-true predicate.
def true : Predicate<"1">;

// PTX version 3.1 or newer.
def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
161
162
163//===----------------------------------------------------------------------===//
164// Some Common Instruction Class Templates
165//===----------------------------------------------------------------------===//
166
// Integer template producing register-register (rr) and register-immediate
// (ri) forms for the 64-, 32- and 16-bit integer register classes.  Each
// instruction has one destination and two sources, and its mnemonic is
// "<OpcStr><Width>" (e.g. OpcStr "add.s" yields "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {
  // 64-bit forms.
  def i64rr : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, Int64Regs:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, Int64Regs:$b))]>;
  def i64ri : NVPTXInst<(outs Int64Regs:$dst),
                        (ins Int64Regs:$a, i64imm:$b),
                        OpcStr # "64 \t$dst, $a, $b;",
                        [(set Int64Regs:$dst,
                              (OpNode Int64Regs:$a, imm:$b))]>;

  // 32-bit forms.
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # "32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;

  // 16-bit forms.
  def i16rr : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, Int16Regs:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, Int16Regs:$b))]>;
  def i16ri : NVPTXInst<(outs Int16Regs:$dst),
                        (ins Int16Regs:$a, i16imm:$b),
                        OpcStr # "16 \t$dst, $a, $b;",
                        [(set Int16Regs:$dst,
                              (OpNode Int16Regs:$a, (imm):$b))]>;
}
195
// 32-bit-only integer template: one destination and two sources, in
// register-register and register-immediate forms.  The mnemonic is
// "<OpcStr>.s32" (e.g. OpcStr "addc.cc" yields "addc.cc.s32").
multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
  def i32rr : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, Int32Regs:$b),
                        OpcStr # ".s32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, Int32Regs:$b))]>;
  def i32ri : NVPTXInst<(outs Int32Regs:$dst),
                        (ins Int32Regs:$a, i32imm:$b),
                        OpcStr # ".s32 \t$dst, $a, $b;",
                        [(set Int32Regs:$dst,
                              (OpNode Int32Regs:$a, imm:$b))]>;
}
208
// Floating-point template: one destination and two sources, for f64 and f32,
// in register-register and register-immediate forms.  Mnemonics are
// "<OpcStr>.f<Width>" (e.g. "add.f64").  All forms require allowFMA.
//
// The f32 forms also come in ".ftz" variants (flush subnormal inputs and
// results to sign-preserving zero), which additionally require doF32FTZ.
//
// NOTE(review): the _ftz records are defined ahead of the plain f32 records
// that share their pattern; presumably definition order decides which one is
// tried first, so keep this order unless that is confirmed not to matter.
multiclass F3<string OpcStr, SDNode OpNode> {
  // f64.
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  // f32 with flush-to-zero.
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;

  // Plain f32.
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;
}
252
// Counterpart of F3 that emits the ".rn" (round to nearest even) mnemonics.
// Every form requires noFMA rather than allowFMA; the f32 ".ftz" forms also
// require doF32FTZ.
//
// NOTE(review): as in F3, the _ftz records precede the plain f32 records with
// the same pattern; keep this order unless it is confirmed not to matter.
multiclass F3_rn<string OpcStr, SDNode OpNode> {
  // f64.
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    Requires<[noFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
              OpcStr # ".rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;

  // f32 with flush-to-zero.
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;

  // Plain f32.
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[noFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
              OpcStr # ".rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
}
292
// Unary floating-point template: one destination and one source register.
// Produces three instructions: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero; requires doF32FTZ) and <OpcStr>.f32.
//
// NOTE(review): f32_ftz is defined ahead of f32, which has the same pattern;
// keep this order unless it is confirmed not to matter.
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 :
    NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
              OpcStr # ".f64 \t$dst, $a;",
              [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
  def f32_ftz :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".ftz.f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    Requires<[doF32FTZ]>;
  def f32 :
    NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
              OpcStr # ".f32 \t$dst, $a;",
              [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
}
308
309//===----------------------------------------------------------------------===//
310// NVPTX Instructions.
311//===----------------------------------------------------------------------===//
312
313//-----------------------------------
314// Type Conversion
315//-----------------------------------
316
let hasSideEffects = 0 in {
  // Emits one pattern-less cvt instruction per source type, all writing to
  // the register class RC.  FromName is spliced into the mnemonic directly
  // after the mode modifiers, and the def suffix (_s8, _u8, ...) names the
  // type that follows it.  Every instance takes a CvtMode immediate whose
  // base/ftz/sat parts are printed through the ${mode:...} modifiers;
  // CvtNONE can be passed to omit a conversion mode.
  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
    // Sources held in 16-bit integer registers.
    def _s8 : NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                            # ".s8\t$dst, $src;",
                        []>;
    def _u8 : NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                        "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                            # ".u8\t$dst, $src;",
                        []>;
    def _s16 : NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".s16\t$dst, $src;",
                         []>;
    def _u16 : NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".u16\t$dst, $src;",
                         []>;
    def _f16 : NVPTXInst<(outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".f16\t$dst, $src;",
                         []>;

    // Sources held in 32-bit integer registers.
    def _s32 : NVPTXInst<(outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".s32\t$dst, $src;",
                         []>;
    def _u32 : NVPTXInst<(outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".u32\t$dst, $src;",
                         []>;

    // Sources held in 64-bit integer registers.
    def _s64 : NVPTXInst<(outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".s64\t$dst, $src;",
                         []>;
    def _u64 : NVPTXInst<(outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".u64\t$dst, $src;",
                         []>;

    // Sources held in floating-point registers.
    def _f32 : NVPTXInst<(outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".f32\t$dst, $src;",
                         []>;
    def _f64 : NVPTXInst<(outs RC:$dst), (ins Float64Regs:$src, CvtMode:$mode),
                         "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName
                             # ".f64\t$dst, $src;",
                         []>;
  }

  // Instantiate cvts from all types to all types.  The 8- and 16-bit types
  // (including f16) use Int16Regs as their register class.
  defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
  defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

  // In-register cvts.  Unlike the instructions above, source and destination
  // use the same register class, and there is no CvtMode operand.
  def CVT_INREG_s16_s8 :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
              "cvt.s16.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s8 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s8 \t$dst, $src;", []>;
  def CVT_INREG_s32_s16 :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
              "cvt.s32.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s8 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s8 \t$dst, $src;", []>;
  def CVT_INREG_s64_s16 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s16 \t$dst, $src;", []>;
  def CVT_INREG_s64_s32 :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
              "cvt.s64.s32 \t$dst, $src;", []>;
}
407
408//-----------------------------------
409// Integer Arithmetic
410//-----------------------------------
411
// i1 "arithmetic" template: whichever node is supplied as OpNode, both the
// register-register and register-immediate forms are emitted as xor.pred.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
  def _ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              "xor.pred \t$dst, $a, $b;",
              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
}
421
// On i1, addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// Signed add/sub on i16, i32 and i64.  Since nvptx is two's complement, the
// same instructions also serve for unsigned arithmetic.
defm ADD : I3<"add.s", add>;
defm SUB : I3<"sub.s", sub>;

// 32-bit add/sub with carry-out.
// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;

// 32-bit add/sub with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;

// Low half of the product.
defm MULT : I3<"mul.lo.s", mul>;

// High half of the product, signed and unsigned.
defm MULTHS : I3<"mul.hi.s", mulhs>;
defm MULTHU : I3<"mul.hi.u", mulhu>;

// Division, signed and unsigned.
defm SDIV : I3<"div.s", sdiv>;
defm UDIV : I3<"div.u", udiv>;

// Remainder, signed and unsigned.  The ri versions of rem.s and rem.u won't
// be selected; DAGCombiner::visitSREM will lower it.
defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
452
453
//
// Wide multiplication
//
// Pattern-less mul.wide instructions: the destination register class is twice
// as wide as the register source.  Each flavour exists with a register second
// operand, an immediate of the source width (Imm) and an immediate of the
// destination width (Imm64 / Imm32).

// 32 x 32 -> 64, signed.
def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
                              (ins Int32Regs:$a, i32imm:$b),
                              "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 : NVPTXInst<(outs Int64Regs:$dst),
                                (ins Int32Regs:$a, i64imm:$b),
                                "mul.wide.s32 \t$dst, $a, $b;", []>;

// 32 x 32 -> 64, unsigned.
def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
                           (ins Int32Regs:$a, Int32Regs:$b),
                           "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
                              (ins Int32Regs:$a, i32imm:$b),
                              "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 : NVPTXInst<(outs Int64Regs:$dst),
                                (ins Int32Regs:$a, i64imm:$b),
                                "mul.wide.u32 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, signed.
def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int16Regs:$a, i16imm:$b),
                              "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 : NVPTXInst<(outs Int32Regs:$dst),
                                (ins Int16Regs:$a, i32imm:$b),
                                "mul.wide.s16 \t$dst, $a, $b;", []>;

// 16 x 16 -> 32, unsigned.
def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
                           (ins Int16Regs:$a, Int16Regs:$b),
                           "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
                              (ins Int16Regs:$a, i16imm:$b),
                              "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 : NVPTXInst<(outs Int32Regs:$dst),
                                (ins Int16Regs:$a, i32imm:$b),
                                "mul.wide.u16 \t$dst, $a, $b;", []>;
496
// Type profile for the mul.wide nodes: one result and two operands, with the
// two operands constrained to the same type.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;

// Target-specific signed and unsigned widening-multiply nodes.
def mul_wide_signed
    : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned
    : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
500
// Selection of the signed / unsigned mul.wide ISD nodes.  All of these
// patterns are guarded by doMulWide.
let Predicates = [doMulWide] in {
  // 16 x 16 -> 32.
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
            (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
            (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

  // 32 x 32 -> 64.
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
            (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
            (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
}
527
// Immediate-range predicates used when converting some patterns to mul.wide.

// Immediate representable as a signed 32-bit integer.
def SInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(32);
}]>;

// Immediate representable as an unsigned 32-bit integer.
def UInt32Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(32);
}]>;

// Immediate representable as a signed 16-bit integer.
def SInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isSignedIntN(16);
}]>;

// Immediate representable as an unsigned 16-bit integer.
def UInt16Const : PatLeaf<(imm), [{
  return N->getAPIntValue().isIntN(16);
}]>;
548
// Shift amounts that may be rewritten as a mul.wide multiplier of 1 << v.
//
// The multiplier is materialised as a 32-bit (Int5Const) or 16-bit
// (Int4Const) immediate.  The top shift amount (31, resp. 15) would produce a
// value with only the sign bit set, which a signed mul.wide reads as a
// negative factor, so (sext x) << v would be computed with the wrong sign.
// Both leaves therefore reject that top value; such shifts simply are not
// converted to mul.wide.  The leaf names are kept because patterns refer to
// them.
def Int5Const : PatLeaf<(imm), [{
  // Check if 0 <= v < 31; only then does 1 << v fit in a signed int32.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(31);
}]>;

def Int4Const : PatLeaf<(imm), [{
  // Check if 0 <= v < 15; only then does 1 << v fit in a signed int16.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(15);
}]>;
560
// Turns a shift-amount immediate v into the i32 target constant 1 << v.
def SHL2MUL32 : SDNodeXForm<imm, [{
  APInt One(32, 1);
  const APInt &ShAmt = N->getAPIntValue();
  return CurDAG->getTargetConstant(One.shl(ShAmt), SDLoc(N), MVT::i32);
}]>;

// Turns a shift-amount immediate v into the i16 target constant 1 << v.
def SHL2MUL16 : SDNodeXForm<imm, [{
  APInt One(16, 1);
  const APInt &ShAmt = N->getAPIntValue();
  return CurDAG->getTargetConstant(One.shl(ShAmt), SDLoc(N), MVT::i16);
}]>;
572
// Rewrite "sign/zero-extend, then shift left by an immediate" as mul.wide by
// the immediate that SHL2MUL32 / SHL2MUL16 derive from the shift amount.
// All of these patterns are guarded by doMulWide.
let Predicates = [doMulWide] in {
  // 32-bit source.
  def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
            (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
  def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
            (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;

  // 16-bit source.
  def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
            (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
            (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
}
587
// Rewrite "sign/zero-extend, then multiply" as mul.wide.  The second operand
// is either extended the same way as the first, or an immediate accepted by
// the matching range predicate.  All of these patterns are guarded by
// doMulWide.
let Predicates = [doMulWide] in {
  // 32 x 32 -> 64, signed.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>;

  // 32 x 32 -> 64, unsigned.
  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>;

  // 16 x 16 -> 32, signed.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>;

  // 16 x 16 -> 32, unsigned.
  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>;
}
616
//
// Integer multiply-add
//
// Type profile: one result and three operands.  The result and operand 2 are
// integers, and the result has the same type as operands 1, 2 and 3.
def SDTIMAD : SDTypeProfile<1, 3,
                            [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
                             SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;

// Target-specific integer multiply-add node.
def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
624
// mad.lo instructions selecting the imad node for 16-, 32- and 64-bit
// integers.  The three-letter suffix gives the kind of each source operand
// ($a, $b, $c): r = register, i = immediate.  $a is always a register.

// 16-bit.
def MAD16rrr : NVPTXInst<
    (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
    "mad.lo.s16 \t$dst, $a, $b, $c;",
    [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri : NVPTXInst<
    (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
    "mad.lo.s16 \t$dst, $a, $b, $c;",
    [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir : NVPTXInst<
    (outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
    "mad.lo.s16 \t$dst, $a, $b, $c;",
    [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii : NVPTXInst<
    (outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
    "mad.lo.s16 \t$dst, $a, $b, $c;",
    [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;

// 32-bit.
def MAD32rrr : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
    "mad.lo.s32 \t$dst, $a, $b, $c;",
    [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
def MAD32rri : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
    "mad.lo.s32 \t$dst, $a, $b, $c;",
    [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
def MAD32rir : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
    "mad.lo.s32 \t$dst, $a, $b, $c;",
    [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
def MAD32rii : NVPTXInst<
    (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
    "mad.lo.s32 \t$dst, $a, $b, $c;",
    [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;

// 64-bit.
def MAD64rrr : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
    "mad.lo.s64 \t$dst, $a, $b, $c;",
    [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
    "mad.lo.s64 \t$dst, $a, $b, $c;",
    [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
    "mad.lo.s64 \t$dst, $a, $b, $c;",
    [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii : NVPTXInst<
    (outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
    "mad.lo.s64 \t$dst, $a, $b, $c;",
    [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
687
// Integer negation (neg.s<Width>) selecting the ineg fragment for 16-, 32-
// and 64-bit registers.
def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                       "neg.s16 \t$dst, $src;",
                       [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                       "neg.s32 \t$dst, $src;",
                       [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                       "neg.s64 \t$dst, $src;",
                       [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
700
701//-----------------------------------
702// Floating Point Arithmetic
703//-----------------------------------
704
// Matches a floating-point immediate that has IEEE single semantics and the
// value 1.0f.
def FloatConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEsingle &&
         Val.convertToFloat() == 1.0f;
}]>;

// Matches a floating-point immediate that has IEEE double semantics and the
// value 1.0.
def DoubleConst1 : PatLeaf<(fpimm), [{
  const llvm::APFloat &Val = N->getValueAPF();
  return &Val.getSemantics() == &llvm::APFloat::IEEEdouble &&
         Val.convertToDouble() == 1.0;
}]>;
715
// Add / sub / mul without an explicit rounding modifier (F3: requires
// allowFMA).
defm FADD : F3<"add", fadd>;
defm FSUB : F3<"sub", fsub>;
defm FMUL : F3<"mul", fmul>;

// Add / sub / mul with the ".rn" modifier (F3_rn: requires noFMA).
defm FADD_rn : F3_rn<"add", fadd>;
defm FSUB_rn : F3_rn<"sub", fsub>;
defm FMUL_rn : F3_rn<"mul", fmul>;

// Unary operations: absolute value, negation and ".rn" square root.
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;
727
//
// F64 division
//
// FDIV641r matches a division whose dividend is the constant 1.0 and emits
// rcp.rn.f64 on the divisor alone; $a is not printed.  The other two forms
// emit div.rn.f64 with a register or immediate divisor.
def FDIV641r : NVPTXInst<
    (outs Float64Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
    "rcp.rn.f64 \t$dst, $b;",
    [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
def FDIV64rr : NVPTXInst<
    (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
    "div.rn.f64 \t$dst, $a, $b;",
    [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
def FDIV64ri : NVPTXInst<
    (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
    "div.rn.f64 \t$dst, $a, $b;",
    [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
746
747//
748// F32 Approximate reciprocal
749//
// Both defs match (fdiv 1.0f, b) via FloatConst1 and emit rcp.approx; $a is
// not printed.  They are gated on do_DIVF32_APPROX; the _ftz variant
// additionally requires doF32FTZ and emits the .ftz form.
750def FDIV321r_ftz :
751  NVPTXInst<(outs Float32Regs:$dst),
752            (ins f32imm:$a, Float32Regs:$b),
753            "rcp.approx.ftz.f32 \t$dst, $b;",
754            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
755            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
756def FDIV321r :
757  NVPTXInst<(outs Float32Regs:$dst),
758            (ins f32imm:$a, Float32Regs:$b),
759            "rcp.approx.f32 \t$dst, $b;",
760            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
761            Requires<[do_DIVF32_APPROX]>;
762//
763// F32 Approximate division
764//
// General (fdiv a, b) selected as div.approx when do_DIVF32_APPROX holds.
// rr = register divisor, ri = fpimm divisor; the _ftz variants additionally
// require doF32FTZ and emit the .ftz form.
765def FDIV32approxrr_ftz :
766  NVPTXInst<(outs Float32Regs:$dst),
767            (ins Float32Regs:$a, Float32Regs:$b),
768            "div.approx.ftz.f32 \t$dst, $a, $b;",
769            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
770            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
771def FDIV32approxri_ftz :
772  NVPTXInst<(outs Float32Regs:$dst),
773            (ins Float32Regs:$a, f32imm:$b),
774            "div.approx.ftz.f32 \t$dst, $a, $b;",
775            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
776            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
777def FDIV32approxrr :
778  NVPTXInst<(outs Float32Regs:$dst),
779            (ins Float32Regs:$a, Float32Regs:$b),
780            "div.approx.f32 \t$dst, $a, $b;",
781            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
782            Requires<[do_DIVF32_APPROX]>;
783def FDIV32approxri :
784  NVPTXInst<(outs Float32Regs:$dst),
785            (ins Float32Regs:$a, f32imm:$b),
786            "div.approx.f32 \t$dst, $a, $b;",
787            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
788            Requires<[do_DIVF32_APPROX]>;
789//
790// F32 Semi-accurate reciprocal
791//
792// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
793//
// These match (fdiv 1.0f, b) under do_DIVF32_FULL (plus doF32FTZ for the
// _ftz variant) and emit the same rcp.approx strings as the approximate
// reciprocal defs; $a is not printed.
794def FDIV321r_approx_ftz :
795  NVPTXInst<(outs Float32Regs:$dst),
796            (ins f32imm:$a, Float32Regs:$b),
797            "rcp.approx.ftz.f32 \t$dst, $b;",
798            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
799            Requires<[do_DIVF32_FULL, doF32FTZ]>;
800def FDIV321r_approx :
801  NVPTXInst<(outs Float32Regs:$dst),
802            (ins f32imm:$a, Float32Regs:$b),
803            "rcp.approx.f32 \t$dst, $b;",
804            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
805            Requires<[do_DIVF32_FULL]>;
806//
807// F32 Semi-accurate division
808//
// General (fdiv a, b) selected as div.full when do_DIVF32_FULL holds.
// rr = register divisor, ri = fpimm divisor; the _ftz variants additionally
// require doF32FTZ and emit the .ftz form.
809def FDIV32rr_ftz :
810  NVPTXInst<(outs Float32Regs:$dst),
811            (ins Float32Regs:$a, Float32Regs:$b),
812            "div.full.ftz.f32 \t$dst, $a, $b;",
813            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
814            Requires<[do_DIVF32_FULL, doF32FTZ]>;
815def FDIV32ri_ftz :
816  NVPTXInst<(outs Float32Regs:$dst),
817            (ins Float32Regs:$a, f32imm:$b),
818            "div.full.ftz.f32 \t$dst, $a, $b;",
819            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
820            Requires<[do_DIVF32_FULL, doF32FTZ]>;
821def FDIV32rr :
822  NVPTXInst<(outs Float32Regs:$dst),
823            (ins Float32Regs:$a, Float32Regs:$b),
824            "div.full.f32 \t$dst, $a, $b;",
825            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
826            Requires<[do_DIVF32_FULL]>;
827def FDIV32ri :
828  NVPTXInst<(outs Float32Regs:$dst),
829            (ins Float32Regs:$a, f32imm:$b),
830            "div.full.f32 \t$dst, $a, $b;",
831            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
832            Requires<[do_DIVF32_FULL]>;
833//
834// F32 Accurate reciprocal
835//
// Match (fdiv 1.0f, b) and emit rcp.rn; $a is not printed.  Gated only on
// reqPTX20 (plus doF32FTZ for the _ftz variant), i.e. not on either of the
// do_DIVF32_* predicates used by the approximate/full forms above.
836def FDIV321r_prec_ftz :
837  NVPTXInst<(outs Float32Regs:$dst),
838            (ins f32imm:$a, Float32Regs:$b),
839            "rcp.rn.ftz.f32 \t$dst, $b;",
840            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
841            Requires<[reqPTX20, doF32FTZ]>;
842def FDIV321r_prec :
843  NVPTXInst<(outs Float32Regs:$dst),
844            (ins f32imm:$a, Float32Regs:$b),
845            "rcp.rn.f32 \t$dst, $b;",
846            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
847            Requires<[reqPTX20]>;
848//
849// F32 Accurate division
850//
// General (fdiv a, b) selected as div.rn, gated on reqPTX20 (plus doF32FTZ
// for the _ftz variants).  rr = register divisor, ri = fpimm divisor.  The
// frem lowering patterns later in this file build on these instructions.
851def FDIV32rr_prec_ftz :
852  NVPTXInst<(outs Float32Regs:$dst),
853            (ins Float32Regs:$a, Float32Regs:$b),
854            "div.rn.ftz.f32 \t$dst, $a, $b;",
855            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
856            Requires<[doF32FTZ, reqPTX20]>;
857def FDIV32ri_prec_ftz :
858  NVPTXInst<(outs Float32Regs:$dst),
859            (ins Float32Regs:$a, f32imm:$b),
860            "div.rn.ftz.f32 \t$dst, $a, $b;",
861            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
862            Requires<[doF32FTZ, reqPTX20]>;
863def FDIV32rr_prec :
864  NVPTXInst<(outs Float32Regs:$dst),
865            (ins Float32Regs:$a, Float32Regs:$b),
866            "div.rn.f32 \t$dst, $a, $b;",
867            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
868            Requires<[reqPTX20]>;
869def FDIV32ri_prec :
870  NVPTXInst<(outs Float32Regs:$dst),
871            (ins Float32Regs:$a, f32imm:$b),
872            "div.rn.f32 \t$dst, $a, $b;",
873            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
874            Requires<[reqPTX20]>;
875
876//
877// F32 rsqrt
878//
879
// rsqrt.approx.f32 with no selection pattern of its own; it is only produced
// by the explicit Pat that follows.  Because the pattern list is empty,
// TableGen cannot infer the instruction's properties, so hasSideEffects is
// cleared explicitly -- matching the other pattern-less defs in this file
// (BFE, SETP, SELP, ...) and allowing the pure computation to be CSE'd or
// deleted when dead.
let hasSideEffects = 0 in
880def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
881                       "rsqrt.approx.f32 \t$dst, $b;", []>;
882
883// Convert 1.0f/sqrt(x) to rsqrt.approx.f32.  (There is an rsqrt.approx.f64, but
884// it's emulated in software.)
// The sqrt must come from the int_nvvm_sqrt_f intrinsic, and the rewrite only
// applies when do_DIVF32_FULL, do_SQRTF32_APPROX and doNoF32FTZ all hold.
885def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
886         (RSQRTF32approx1r Float32Regs:$b)>,
887         Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
888
// Template for fused multiply-add: dst = fma(a, b, c).
//   OpcStr - full PTX opcode, e.g. "fma.rn.f32"
//   RC     - register class of the result and register operands
//   ImmCls - operand class used for immediate operands
//   Pred   - predicate every variant is gated on
// The suffix names the kind of each source operand (r = register,
// i = fpimm); $a is always a register.
889multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
890   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
891                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
892                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
893                       Requires<[Pred]>;
894   def rri : NVPTXInst<(outs RC:$dst),
895                       (ins RC:$a, RC:$b, ImmCls:$c),
896                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
897                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
898                       Requires<[Pred]>;
899   def rir : NVPTXInst<(outs RC:$dst),
900                       (ins RC:$a, ImmCls:$b, RC:$c),
901                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
902                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
903                       Requires<[Pred]>;
904   def rii : NVPTXInst<(outs RC:$dst),
905                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
906                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
907                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
908                       Requires<[Pred]>;
909}
910
// FMA instantiations.  The FTZ f32 form is gated on doF32FTZ; the other two
// pass the `true` predicate.
// NOTE(review): `true` is presumably an always-satisfied Predicate defined
// outside this section -- confirm.
911defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
912defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
913defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
914
915// sin/cos
// f32 only: fsin / fcos on Float32Regs are selected directly to the
// sin.approx.f32 / cos.approx.f32 instructions.
916def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
917                      "sin.approx.f32 \t$dst, $src;",
918                      [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
919def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
920                      "cos.approx.f32 \t$dst, $src;",
921                      [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
922
923// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
924// i.e. "poor man's fmod()"
//
// The quotient is rounded to an integer toward zero (CvtRZI / CvtRZI_FTZ),
// not toward negative infinity: LLVM's frem yields a remainder with the sign
// of the dividend, and a floor-based quotient gives the wrong answer whenever
// x / y is negative and non-integral (frem(-5.5, 2.0) must be -1.5, whereas
// floor would produce 0.5).
//
// Each type has a register-divisor and an fpimm-divisor pattern; the f32 FTZ
// patterns are gated on doF32FTZ and use the .ftz sub/mul/div instructions.
925
926// frem - f32 FTZ
927def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
928          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
929            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
930             Float32Regs:$y))>,
931          Requires<[doF32FTZ]>;
932def : Pat<(frem Float32Regs:$x, fpimm:$y),
933          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
934            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
935             fpimm:$y))>,
936          Requires<[doF32FTZ]>;
937
938// frem - f32
939def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
940          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
941            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
942             Float32Regs:$y))>;
943def : Pat<(frem Float32Regs:$x, fpimm:$y),
944          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
945            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
946             fpimm:$y))>;
947
948// frem - f64
949def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
950          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
951            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
952             Float64Regs:$y))>;
953def : Pat<(frem Float64Regs:$x, fpimm:$y),
954          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
955            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
956             fpimm:$y))>;
957
958//-----------------------------------
959// Bitwise operations
960//-----------------------------------
961
962// Template for three-arg bitwise operations.  Takes three args and creates
963// .b16, .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of
// OpcStr.  Each width gets an rr (register, register) and an ri (register,
// immediate) variant matching OpNode.
964multiclass BITWISE<string OpcStr, SDNode OpNode> {
965  def b1rr :
966    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
967              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
968              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
969  def b1ri :
970    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
971              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
972              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
973  def b16rr :
974    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
975              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
976              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
977  def b16ri :
978    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
979              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
980              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
981  def b32rr :
982    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
983              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
984              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
985  def b32ri :
986    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
987              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
988              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
989  def b64rr :
990    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
991              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
992              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
993  def b64ri :
994    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
995              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
996              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
997}
998
// Instantiate BITWISE for or / and / xor, yielding e.g. ORb32rr, ANDb1ri.
999defm OR  : BITWISE<"or", or>;
1000defm AND : BITWISE<"and", and>;
1001defm XOR : BITWISE<"xor", xor>;
1002
// Bitwise complement, one def per width: not.pred for i1 predicates and
// not.b16/.b32/.b64 for the integer register classes.  Register operand only.
1003def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
1004                      "not.pred \t$dst, $src;",
1005                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
1006def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1007                      "not.b16 \t$dst, $src;",
1008                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
1009def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1010                      "not.b32 \t$dst, $src;",
1011                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
1012def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1013                       "not.b64 \t$dst, $src;",
1014                       [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
1015
1016// Template for left/right shifts.  Takes three operands,
1017//   [dest (reg), src (reg), shift (reg or imm)].
1018// dest and src may be int64, int32, or int16, but shift is always int32.
1019//
1020// This template also defines a 32-bit shift (imm, imm) instruction.
//
// OpcStr is the opcode up to and including the type letter (e.g. "shl.b");
// each def appends the bit width ("64", "32" or "16") to form the mnemonic.
1021multiclass SHIFT<string OpcStr, SDNode OpNode> {
1022   def i64rr :
1023     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
1024               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1025               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
1026   def i64ri :
1027     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
1028               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1029               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
1030   def i32rr :
1031     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1032               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1033               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
1034   def i32ri :
1035     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1036               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1037               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
1038   def i32ii :
1039     NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
1040               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1041               [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
1042   def i16rr :
1043     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
1044               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1045               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
1046   def i16ri :
1047     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
1048               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1049               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
1050}
1051
// Instantiate SHIFT: shl -> shl.b<N>, sra -> shr.s<N> (signed right shift),
// srl -> shr.u<N> (unsigned right shift).
1052defm SHL : SHIFT<"shl.b", shl>;
1053defm SRA : SHIFT<"shr.s", sra>;
1054defm SRL : SHIFT<"shr.u", srl>;
1055
1056//
1057// Rotate: Use ptx shf instruction if available.
1058//
// The four *_hw defs are gated on hasHWROT32 and implement a 32-bit rotate
// as a wrap-mode funnel shift with $src passed as both data operands.  The
// amount is either an i32 immediate (imm) or an Int32Regs register (reg).
1059
1060// 32 bit r2 = rotl r1, n
1061//    =>
1062//        r2 = shf.l r1, r1, n
1063def ROTL32imm_hw :
1064  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1065            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1066            [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
1067           Requires<[hasHWROT32]>;
1068
1069def ROTL32reg_hw :
1070  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1071            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1072            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1073           Requires<[hasHWROT32]>;
1074
1075// 32 bit r2 = rotr r1, n
1076//    =>
1077//        r2 = shf.r r1, r1, n
1078def ROTR32imm_hw :
1079  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
1080            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1081            [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
1082           Requires<[hasHWROT32]>;
1083
1084def ROTR32reg_hw :
1085  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1086            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1087            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1088           Requires<[hasHWROT32]>;
1089
1090// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
// Expands to a scoped PTX block that shifts $src left by $amt1 and right by
// $amt2 into two temporaries and adds them.  It has no selection pattern; the
// rotl/rotr Pats defined after it supply the two amounts.  Because the
// pattern list is empty, TableGen cannot infer the instruction's properties,
// so hasSideEffects is cleared explicitly -- matching the other pattern-less
// defs in this file -- since the block only computes $dst from its inputs.
let hasSideEffects = 0 in
1091def ROT32imm_sw :
1092  NVPTXInst<(outs Int32Regs:$dst),
1093            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
1094            "{{\n\t"
1095            ".reg .b32 %lhs;\n\t"
1096            ".reg .b32 %rhs;\n\t"
1097            "shl.b32 \t%lhs, $src, $amt1;\n\t"
1098            "shr.b32 \t%rhs, $src, $amt2;\n\t"
1099            "add.u32 \t$dst, %lhs, %rhs;\n\t"
1100            "}}",
1101            []>;
1102
// Immediate transform: rewrites a constant amt into the i32 target constant
// (32 - amt), using the zero-extended value of the matched node.  Supplies
// the complementary shift amount for the software 32-bit rotate.
1103def SUB_FRM_32 : SDNodeXForm<imm, [{
1104  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
1105}]>;
1106
// Without hardware rotate (noHWROT32), a 32-bit rotate by immediate becomes
// ROT32imm_sw.  Its operands are (src, left-shift amount, right-shift
// amount): rotl passes (amt, 32 - amt) and rotr passes (32 - amt, amt).
1107def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
1108          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
1109      Requires<[noHWROT32]>;
1110def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
1111          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
1112      Requires<[noHWROT32]>;
1113
1114// 32-bit software rotate left by register.
// Used when noHWROT32 holds.  The emitted block computes %amt2 = 32 - $amt
// at run time, shifts $src left by $amt and right by %amt2, and adds the two
// halves.
1115def ROTL32reg_sw :
1116  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1117            "{{\n\t"
1118            ".reg .b32 %lhs;\n\t"
1119            ".reg .b32 %rhs;\n\t"
1120            ".reg .b32 %amt2;\n\t"
1121            "shl.b32 \t%lhs, $src, $amt;\n\t"
1122            "sub.s32 \t%amt2, 32, $amt;\n\t"
1123            "shr.b32 \t%rhs, $src, %amt2;\n\t"
1124            "add.u32 \t$dst, %lhs, %rhs;\n\t"
1125            "}}",
1126            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1127           Requires<[noHWROT32]>;
1128
1129// 32-bit software rotate right by register.
// Mirror image of ROTL32reg_sw: shift right by $amt, left by 32 - $amt.
1130def ROTR32reg_sw :
1131  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
1132            "{{\n\t"
1133            ".reg .b32 %lhs;\n\t"
1134            ".reg .b32 %rhs;\n\t"
1135            ".reg .b32 %amt2;\n\t"
1136            "shr.b32 \t%lhs, $src, $amt;\n\t"
1137            "sub.s32 \t%amt2, 32, $amt;\n\t"
1138            "shl.b32 \t%rhs, $src, %amt2;\n\t"
1139            "add.u32 \t$dst, %lhs, %rhs;\n\t"
1140            "}}",
1141            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1142           Requires<[noHWROT32]>;
1143
1144// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
// Expands to a scoped PTX block that shifts $src left by $amt1 and right by
// $amt2 into two temporaries and adds them.  It has no selection pattern; the
// 64-bit rotl/rotr Pats defined after it supply the two amounts.  Because the
// pattern list is empty, TableGen cannot infer the instruction's properties,
// so hasSideEffects is cleared explicitly -- matching the other pattern-less
// defs in this file -- since the block only computes $dst from its inputs.
let hasSideEffects = 0 in
1145def ROT64imm_sw :
1146  NVPTXInst<(outs Int64Regs:$dst),
1147            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
1148            "{{\n\t"
1149            ".reg .b64 %lhs;\n\t"
1150            ".reg .b64 %rhs;\n\t"
1151            "shl.b64 \t%lhs, $src, $amt1;\n\t"
1152            "shr.b64 \t%rhs, $src, $amt2;\n\t"
1153            "add.u64 \t$dst, %lhs, %rhs;\n\t"
1154            "}}",
1155            []>;
1156
// Immediate transform: rewrites a constant amt into the i32 target constant
// (64 - amt), using the zero-extended value of the matched node.  Supplies
// the complementary shift amount for the software 64-bit rotate.
1157def SUB_FRM_64 : SDNodeXForm<imm, [{
1158    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
1159}]>;
1160
// 64-bit rotate by immediate always uses ROT64imm_sw (these patterns carry
// no Requires clause).  Operands are (src, left-shift amount, right-shift
// amount): rotl passes (amt, 64 - amt) and rotr passes (64 - amt, amt).
1161def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
1162          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
1163def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
1164          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
1165
1166// 64-bit software rotate left by register.
// The emitted block computes %amt2 = 64 - $amt at run time, shifts $src left
// by $amt and right by %amt2, and adds the two halves.  The amount is a
// 32-bit register; there is no predicate on this def.
1167def ROTL64reg_sw :
1168  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1169            "{{\n\t"
1170            ".reg .b64 %lhs;\n\t"
1171            ".reg .b64 %rhs;\n\t"
1172            ".reg .u32 %amt2;\n\t"
1173            "shl.b64 \t%lhs, $src, $amt;\n\t"
1174            "sub.u32 \t%amt2, 64, $amt;\n\t"
1175            "shr.b64 \t%rhs, $src, %amt2;\n\t"
1176            "add.u64 \t$dst, %lhs, %rhs;\n\t"
1177            "}}",
1178            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
1179
// 64-bit software rotate right by register.  Mirror image of ROTL64reg_sw:
// shift right by $amt, left by 64 - $amt.
1180def ROTR64reg_sw :
1181  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
1182            "{{\n\t"
1183            ".reg .b64 %lhs;\n\t"
1184            ".reg .b64 %rhs;\n\t"
1185            ".reg .u32 %amt2;\n\t"
1186            "shr.b64 \t%lhs, $src, $amt;\n\t"
1187            "sub.u32 \t%amt2, 64, $amt;\n\t"
1188            "shl.b64 \t%rhs, $src, %amt2;\n\t"
1189            "add.u64 \t$dst, %lhs, %rhs;\n\t"
1190            "}}",
1191            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
1192
1193//
1194// Funnel shift in clamp mode
1195//
1196
1197// Create SDNodes so they can be used in the DAG code, e.g.
1198// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
// SDTIntShiftDOp is the type profile: one result and three operands, where
// operands 1 and 2 have the result's type, and the result and operand 3 are
// integers.
1199def SDTIntShiftDOp :
1200  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
1201                       SDTCisInt<0>, SDTCisInt<3>]>;
1202def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
1203def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
1204
// 32-bit funnel shifts in clamp mode, selected from the target-specific
// FUN_SHFL_CLAMP / FUN_SHFR_CLAMP nodes.  All three sources ($lo, $hi, $amt)
// are 32-bit registers.
1205def FUNSHFLCLAMP :
1206  NVPTXInst<(outs Int32Regs:$dst),
1207            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1208            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
1209            [(set Int32Regs:$dst,
1210              (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1211
1212def FUNSHFRCLAMP :
1213  NVPTXInst<(outs Int32Regs:$dst),
1214            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1215            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
1216            [(set Int32Regs:$dst,
1217             (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
1218
1219//
1220// BFE - bit-field extract
1221//
1222
1223// Template for BFE instructions.  Takes four args,
1224//   [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
1225// Start may be an imm only if end is also an imm.  FIXME: Is this a
1226// restriction in PTX?
1227//
1228// dest and src may be int32 or int64, but start and end are always int32.
//
// NOTE(review): the PTX ISA describes the last two bfe operands as the bit
// position and the field length rather than a start/end pair -- confirm and
// reword the description above if so.
//
// The defs carry no selection patterns.  TyStr is the PTX type suffix
// appended to "bfe.".
1229multiclass BFE<string TyStr, RegisterClass RC> {
1230  def rrr
1231    : NVPTXInst<(outs RC:$d),
1232                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
1233                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1234  def rri
1235    : NVPTXInst<(outs RC:$d),
1236                (ins RC:$a, Int32Regs:$b, i32imm:$c),
1237                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1238  def rii
1239    : NVPTXInst<(outs RC:$d),
1240                (ins RC:$a, i32imm:$b, i32imm:$c),
1241                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
1242}
1243
// Signed and unsigned extracts for 32- and 64-bit sources.  hasSideEffects is
// cleared explicitly because the defs have no patterns to infer it from.
1244let hasSideEffects = 0 in {
1245  defm BFE_S32 : BFE<"s32", Int32Regs>;
1246  defm BFE_U32 : BFE<"u32", Int32Regs>;
1247  defm BFE_S64 : BFE<"s64", Int64Regs>;
1248  defm BFE_U64 : BFE<"u64", Int64Regs>;
1249}
1250
1251//-----------------------------------
1252// Comparison instructions (setp, set)
1253//-----------------------------------
1254
1255// FIXME: This doesn't cover versions of set and setp that combine with a
1256// boolean predicate, e.g. setp.eq.and.b16.
1257
// Template for setp: compares $a with $b and writes an i1 predicate.  The
// comparison is the extra CmpMode operand $cmp, printed twice in the mnemonic
// with the `base` and `ftz` modifiers.  Variants: rr (reg, reg), ri (reg,
// imm) and ir (imm, reg).  No selection patterns; hasSideEffects is cleared
// explicitly.
1258let hasSideEffects = 0 in {
1259  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1260    def rr :
1261      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1262                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1263                           "\t$dst, $a, $b;"), []>;
1264    def ri :
1265      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1266                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1267                           "\t$dst, $a, $b;"), []>;
1268    def ir :
1269      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1270                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1271                           "\t$dst, $a, $b;"), []>;
1272  }
1273}
1274
// setp instantiations: .b/.s/.u for each of the 16/32/64-bit integer register
// classes, plus f32 and f64.
1275defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
1276defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
1277defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
1278defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
1279defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
1280defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
1281defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
1282defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
1283defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
1284defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
1285defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
1286
1287// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1288// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1289// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1290
// Template for set: like SETP, but the result is written to a 32-bit integer
// register and $cmp is printed once without modifiers.  Variants: rr, ri, ir.
// No selection patterns; hasSideEffects is cleared explicitly.
1291let hasSideEffects = 0 in {
1292  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
1293    def rr : NVPTXInst<(outs Int32Regs:$dst),
1294                       (ins RC:$a, RC:$b, CmpMode:$cmp),
1295                       !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
1296    def ri : NVPTXInst<(outs Int32Regs:$dst),
1297                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1298                       !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
1299    def ir : NVPTXInst<(outs Int32Regs:$dst),
1300                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1301                       !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
1302  }
1303}
1304
// set instantiations, mirroring the SETP list: .b/.s/.u for 16/32/64-bit
// integers, plus f32 and f64.
1305defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
1306defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
1307defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
1308defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
1309defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
1310defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
1311defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
1312defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
1313defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
1314defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
1315defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1316
1317//-----------------------------------
1318// Selection instructions (selp)
1319//-----------------------------------
1320
1321// FIXME: Missing slct
1322
1323// selp instructions that don't have any pattern matches; we explicitly use
1324// them within this file.
//
// Both templates emit "selp.<TypeStr> $dst, $a, $b, $p" with an i1 predicate
// $p, in four variants named after the kinds of $a and $b (r = register,
// i = immediate).  SELP has empty pattern lists.  SELP_PATTERN additionally
// matches (select $p, $a, $b), using ImmNode (imm or fpimm) for immediate
// operands.
1325let hasSideEffects = 0 in {
1326  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1327    def rr : NVPTXInst<(outs RC:$dst),
1328                       (ins RC:$a, RC:$b, Int1Regs:$p),
1329                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1330    def ri : NVPTXInst<(outs RC:$dst),
1331                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
1332                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1333    def ir : NVPTXInst<(outs RC:$dst),
1334                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
1335                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1336    def ii : NVPTXInst<(outs RC:$dst),
1337                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
1338                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
1339  }
1340
1341  multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
1342                          SDNode ImmNode> {
1343    def rr :
1344      NVPTXInst<(outs RC:$dst),
1345                (ins RC:$a, RC:$b, Int1Regs:$p),
1346                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1347                [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
1348    def ri :
1349      NVPTXInst<(outs RC:$dst),
1350                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
1351                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1352                [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
1353    def ir :
1354      NVPTXInst<(outs RC:$dst),
1355                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
1356                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1357                [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
1358    def ii :
1359      NVPTXInst<(outs RC:$dst),
1360                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
1361                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
1362                [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
1363  }
1364}
1365
1366// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
1367// good.
// Hence only the .b integer forms and the f32/f64 forms use SELP_PATTERN; the
// .s/.u forms are pattern-less SELP instantiations.
1368defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
1369defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
1370defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
1371defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
1372defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
1373defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
1374defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
1375defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
1376defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
1377defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
1378defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
1379
1380//-----------------------------------
1381// Data Movement (Load / Store, Move)
1382//-----------------------------------
1383
// Addressing-mode complex patterns rooted at frameindex nodes.  Each produces
// two operands and is implemented by the named C++ selector (SelectADDRri /
// SelectADDRri64); the i32 and i64 versions differ only in pointer type.
1384def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
1385                            [SDNPWantRoot]>;
1386def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
1387                              [SDNPWantRoot]>;
1388
// Memory operands made of two machine operands (a register and an immediate),
// printed through printMemOperand.
1389def MEMri : Operand<i32> {
1390  let PrintMethod = "printMemOperand";
1391  let MIOperandInfo = (ops Int32Regs, i32imm);
1392}
1393def MEMri64 : Operand<i64> {
1394  let PrintMethod = "printMemOperand";
1395  let MIOperandInfo = (ops Int64Regs, i64imm);
1396}
1397
// Pointer-typed operands (iPTR / iPTRAny) printed with the generic
// printOperand method.
1398def imem : Operand<iPTR> {
1399  let PrintMethod = "printOperand";
1400}
1401
1402def imemAny : Operand<iPTRAny> {
1403  let PrintMethod = "printOperand";
1404}
1405
// i32 operand printed via printLdStCode.
1406def LdStCode : Operand<i32> {
1407  let PrintMethod = "printLdStCode";
1408}
1409
// NVPTXISD::Wrapper: one pointer-typed result with a single operand of the
// same type.  The MOV_ADDR defs match it around tglobaladdr.
1410def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
1411def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
1412
1413// Load a memory address into a u32 or u64 register.
// Selected from (Wrapper tglobaladdr) for the 32-bit and 64-bit result
// register classes respectively.
1414def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
1415                         "mov.u32 \t$dst, $a;",
1416                         [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1417def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
1418                           "mov.u64 \t$dst, $a;",
1419                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
1420
1421// Get pointer to local stack.
// Moves the symbol __local_depot<num> into a 32- or 64-bit register; $num is
// printed as a suffix of the symbol name.  No selection patterns, so
// hasSideEffects is cleared explicitly.
1422let hasSideEffects = 0 in {
1423  def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
1424                                     "mov.u32 \t$d, __local_depot$num;", []>;
1425  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
1426                                    "mov.u64 \t$d, __local_depot$num;", []>;
1427}
1428
1429
1430// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
// Register-to-register moves for every register class (i1 predicate, i16,
// i32, i64, f32, f64).  They have no selection patterns, are marked
// IsSimpleMove, and have hasSideEffects cleared.
1431let IsSimpleMove=1, hasSideEffects=0 in {
1432  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
1433                           "mov.pred \t$dst, $sss;", []>;
1434  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
1435                           "mov.u16 \t$dst, $sss;", []>;
1436  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
1437                           "mov.u32 \t$dst, $sss;", []>;
1438  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
1439                           "mov.u64 \t$dst, $sss;", []>;
1440
1441  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1442                           "mov.f32 \t$dst, $src;", []>;
1443  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
1444                           "mov.f64 \t$dst, $src;", []>;
1445}
1446
// Immediate-to-register moves: materialize an integer (imm) or floating-point
// (fpimm) constant into a register of the matching class.  Note the 64-bit
// integer form is named IMOV64i, not IMOV64ri.
1447def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
1448                        "mov.pred \t$dst, $src;",
1449                        [(set Int1Regs:$dst, imm:$src)]>;
1450def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
1451                         "mov.u16 \t$dst, $src;",
1452                         [(set Int16Regs:$dst, imm:$src)]>;
1453def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
1454                         "mov.u32 \t$dst, $src;",
1455                         [(set Int32Regs:$dst, imm:$src)]>;
1456def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
1457                        "mov.u64 \t$dst, $src;",
1458                        [(set Int64Regs:$dst, imm:$src)]>;
1459
1460def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
1461                         "mov.f32 \t$dst, $src;",
1462                         [(set Float32Regs:$dst, fpimm:$src)]>;
1463def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
1464                         "mov.f64 \t$dst, $src;",
1465                         [(set Float64Regs:$dst, fpimm:$src)]>;
1466
1467def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
1468
//---- Copy Frame Index ----
// Selected from the ADDRri / ADDRri64 address forms; the operand is printed
// through the `add` modifier of the memory operand as an add.u32 / add.u64.
def LEA_ADDRi :
  NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
            "add.u32 \t$dst, ${addr:add};",
            [(set Int32Regs:$dst, ADDRri:$addr)]>;
def LEA_ADDRi64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
            "add.u64 \t$dst, ${addr:add};",
            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
1476
1477//-----------------------------------
1478// Comparison and Selection
1479//-----------------------------------
1480
// Integer comparison selection patterns for one condition code.
//
//   OpNode  - the setcc fragment being matched.
//   Mode    - comparison-mode leaf passed as the last operand of every
//             selected instruction.
//   setp_*  - instructions used when the comparison result type is i1.
//   set_*   - instructions used when the comparison result type is i32.
//
// Each family is supplied for 16-, 32- and 64-bit operands in three operand
// shapes: rr (reg, reg), ri (reg, imm) and ir (imm, reg).
multiclass ISET_FORMAT<
    PatFrag OpNode, PatLeaf Mode,
    Instruction setp_16rr, Instruction setp_16ri, Instruction setp_16ir,
    Instruction setp_32rr, Instruction setp_32ri, Instruction setp_32ir,
    Instruction setp_64rr, Instruction setp_64ri, Instruction setp_64ir,
    Instruction set_16rr, Instruction set_16ri, Instruction set_16ir,
    Instruction set_32rr, Instruction set_32ri, Instruction set_32ir,
    Instruction set_64rr, Instruction set_64ri, Instruction set_64ir> {

  //--- Result type i1 (predicate) ---

  // 16-bit operands.
  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;

  // 32-bit operands.
  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;

  // 64-bit operands.
  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

  //--- Result type i32 ---

  // 16-bit operands.
  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
            (set_16ir imm:$a, Int16Regs:$b, Mode)>;

  // 32-bit operands.
  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
            (set_32ir imm:$a, Int32Regs:$b, Mode)>;

  // 64-bit operands.
  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
}
1544
// ISET_FORMAT bound to the signed (s16/s32/s64) SETP_* and SET_* instructions.
multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                // i1-producing compares
                SETP_s16rr, SETP_s16ri, SETP_s16ir,
                SETP_s32rr, SETP_s32ri, SETP_s32ir,
                SETP_s64rr, SETP_s64ri, SETP_s64ir,
                // i32-producing compares
                SET_s16rr, SET_s16ri, SET_s16ir,
                SET_s32rr, SET_s32ri, SET_s32ir,
                SET_s64rr, SET_s64ri, SET_s64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
1556
// ISET_FORMAT bound to the unsigned (u16/u32/u64) SETP_* and SET_*
// instructions.
multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
  : ISET_FORMAT<OpNode, Mode,
                // i1-producing compares
                SETP_u16rr, SETP_u16ri, SETP_u16ir,
                SETP_u32rr, SETP_u32ri, SETP_u32ir,
                SETP_u64rr, SETP_u64ri, SETP_u64ir,
                // i32-producing compares
                SET_u16rr, SET_u16ri, SET_u16ir,
                SET_u32rr, SET_u32ri, SET_u32ir,
                SET_u64rr, SET_u64ri, SET_u64ir> {
  // Placeholder record: TableGen doesn't like empty multiclasses.
  def : PatLeaf<(i32 0)>;
}
1568
// Signed-instruction patterns for setgt/setlt/setge/setle/seteq/setne.
defm : ISET_FORMAT_SIGNED  <setgt,  CmpGT>;
defm : ISET_FORMAT_SIGNED  <setlt,  CmpLT>;
defm : ISET_FORMAT_SIGNED  <setge,  CmpGE>;
defm : ISET_FORMAT_SIGNED  <setle,  CmpLE>;
defm : ISET_FORMAT_SIGNED  <seteq,  CmpEQ>;
defm : ISET_FORMAT_SIGNED  <setne,  CmpNE>;

// Unsigned-instruction patterns for the setu* condition codes.
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
1581
// i1 compares.
// Inequality of two predicates is selected as their XOR; equality as NOT1 of
// that XOR.  The setu* variants select the same instructions.
def : Pat<(setne  Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;

def : Pat<(seteq  Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1592
// i1 compare -> i32
// Both results are built from the XOR of the two predicates.  setne selects
// (-1, 0) on that XOR; seteq selects the swapped pair (0, -1), so it yields
// the opposite value for the same inputs.
// The second pattern previously matched setne as well, which made it an
// unreachable duplicate and left i32-producing seteq on i1 without a pattern
// in this group.
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
1598
1599
1600
// Floating-point comparison selection patterns for one condition code.
//
//   OpNode  - the setcc fragment being matched.
//   Mode    - comparison-mode leaf used by default.
//   ModeFTZ - comparison-mode leaf used by the f32 patterns that are guarded
//             by the doF32FTZ predicate.
//
// Every f32 shape has a doF32FTZ-guarded pattern followed by an unguarded one.
// The f64 patterns always use Mode.  Shapes are rr (reg, reg), ri (reg, fpimm)
// and ir (fpimm, reg).
multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  //--- f32 operands, i1 result ---
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;

  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;

  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  //--- f64 operands, i1 result ---
  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

  //--- f32 operands, i32 result ---
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;

  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;

  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

  //--- f64 operands, i32 result ---
  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
}
1652
// Ordered condition codes (seto*) use the plain comparison modes.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered condition codes (setu*) use the *U comparison modes.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// Condition codes without an o/u prefix use the same modes as the ordered
// ones above.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

// seto / setuo map to the NUM / NAN comparison modes.
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
1676
1677// FIXME: What is this doing here?  Can it be deleted?
1678// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
1679//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1680
// Type profiles for the NVPTX-specific SelectionDAG nodes that follow.  Each
// SDTypeProfile lists the result count, the operand count and the type
// constraints on the numbered values.

// Parameter / return declarations: no results, three integer operands.
def SDTDeclareParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;

// Parameter loads: 1, 2 or 4 results followed by two integer operands.
def SDTLoadParamProfile :
  SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile :
  SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile :
  SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;

// Call printing: one integer operand.
def SDTPrintCallProfile :
  SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile :
  SDTypeProfile<0, 1, [SDTCisInt<0>]>;

// Parameter stores: the first two operands are integers.
def SDTStoreParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile :
  SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile :
  SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;

// Call argument list, call target and call result.
def SDTCallArgProfile :
  SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile :
  SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile :
  SDTypeProfile<0, 1, []>;
def SDTCallValProfile :
  SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile :
  SDTypeProfile<1, 1, []>;

// Return-value stores: the first operand is an integer.
def SDTStoreRetvalProfile :
  SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile :
  SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile :
  SDTypeProfile<0, 5, [SDTCisInt<0>]>;

def SDTPseudoUseParamProfile :
  SDTypeProfile<0, 1, []>;
1703
// Declaration nodes.  All are chained, glued on both sides and marked as
// having side effects.
def DeclareParam
  : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam
  : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam
  : SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet
  : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Parameter-load nodes (scalar, 2-element and 4-element).  Chained, glued on
// both sides and marked SDNPMayLoad.
def LoadParam
  : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
           [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2
  : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
           [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4
  : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
           [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
// Call-printing nodes in regular and convergent flavours, each with a ".uni"
// counterpart.  The convergent nodes share the type profiles of the regular
// ones.
def PrintCall
  : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall
  : SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni
  : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni
  : SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Parameter-store nodes: scalar, 2-element, 4-element, and the U32 / S32
// variants that share SDTStoreParam32Profile.
def StoreParam
  : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2
  : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4
  : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32
  : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32
  : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// Nodes for the pieces of a call: argument-list delimiters, individual
// arguments, the call target, the prototype reference and the call result.
// CallArgEnd, CallVoid and Prototype all share SDTCallVoidProfile.
def CallArgBegin
  : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg
  : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LastCallArg
  : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd
  : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid
  : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype
  : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal
  : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// MoveParam has no node properties (no chain, no glue).
def MoveParam
  : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

// Return-value stores are chained and have side effects but are not glued.
def StoreRetval
  : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
           [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2
  : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
           [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4
  : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
           [SDNPHasChain, SDNPSideEffect]>;

def PseudoUseParam
  : SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// RETURN reuses the operand-less SDTCallArgMarkProfile.
def RETURNNode
  : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
           [SDNPHasChain, SDNPSideEffect]>;
1790
// ld.param instruction classes that read from [retval0+$b].  `opstr` is the
// type suffix appended to the mnemonic (for example ".b32").  None of these
// classes carries a selection pattern.
let mayLoad = 1 in {
  // Scalar load.
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                  !strconcat("ld.param", opstr, "\t$dst, [retval0+$b];"),
                  []>;

  // Two-element vector load.
  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
                  !strconcat("ld.param.v2", opstr,
                             "\t{{$dst, $dst2}}, [retval0+$b];"),
                  []>;

  // Four-element vector load.
  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs regclass:$dst, regclass:$dst2,
                        regclass:$dst3, regclass:$dst4),
                  (ins i32imm:$b),
                  !strconcat("ld.param.v4", opstr,
                             "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
                  []>;
}
1811
// Register form of a parameter load: a mov from retval<$b>, selected from
// LoadParam whose first operand is the constant 0.
class LoadParamRegInst<NVPTXRegClass regclass, string opstr>
  : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
              !strconcat("mov", opstr, "\t$dst, retval$b;"),
              [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
1816
// st.param instruction classes.  `opstr` is the type suffix appended to the
// mnemonic.  The StoreParam* classes write to [param$a+$b]; the StoreRetval*
// classes write to [func_retval0+$a].  None carries a selection pattern.
let mayStore = 1 in {
  //--- Stores to an outgoing call parameter ---
  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a, i32imm:$b),
                  !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
                  []>;

  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b),
                  !strconcat("st.param.v2", opstr,
                             "\t[param$a+$b], {{$val, $val2}};"),
                  []>;

  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a, i32imm:$b),
                  !strconcat("st.param.v4", opstr,
                             "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
                  []>;

  //--- Stores to this function's return value ---
  class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs),
                  (ins regclass:$val, i32imm:$a),
                  !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
                  []>;

  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, i32imm:$a),
                  !strconcat("st.param.v2", opstr,
                             "\t[func_retval0+$a], {{$val, $val2}};"),
                  []>;

  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs),
                  (ins regclass:$val, regclass:$val2, regclass:$val3,
                       regclass:$val4, i32imm:$a),
                  !strconcat("st.param.v4", opstr,
                             "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
                  []>;
}
1857
// Instructions that print the head of a call: the opcode string followed by
// the parenthesised list of retval symbols.  The integer operand matched on
// OpNode selects how many retvals are listed (0 through 8).
let isCall = 1 in {
  multiclass CALL<string OpcStr, SDNode OpNode> {
    def PrintCallNoRetInst :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr, " "),
                [(OpNode (i32 0))]>;
    def PrintCallRetInst1 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr, " (retval0), "),
                [(OpNode (i32 1))]>;
    def PrintCallRetInst2 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr, " (retval0, retval1), "),
                [(OpNode (i32 2))]>;
    def PrintCallRetInst3 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr, " (retval0, retval1, retval2), "),
                [(OpNode (i32 3))]>;
    def PrintCallRetInst4 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
                [(OpNode (i32 4))]>;
    def PrintCallRetInst5 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr,
                           " (retval0, retval1, retval2, retval3, retval4), "),
                [(OpNode (i32 5))]>;
    def PrintCallRetInst6 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr,
                           " (retval0, retval1, retval2, retval3, retval4, "
                           "retval5), "),
                [(OpNode (i32 6))]>;
    def PrintCallRetInst7 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr,
                           " (retval0, retval1, retval2, retval3, retval4, "
                           "retval5, retval6), "),
                [(OpNode (i32 7))]>;
    def PrintCallRetInst8 :
      NVPTXInst<(outs), (ins),
                !strconcat(OpcStr,
                           " (retval0, retval1, retval2, retval3, retval4, "
                           "retval5, retval6, retval7), "),
                [(OpNode (i32 8))]>;
  }
}
1888
// Regular calls.
defm Call    : CALL<"call",     PrintCall>;
defm CallUni : CALL<"call.uni", PrintCallUni>;

// Convergent calls print the same text as the regular calls above; the only
// difference is that isConvergent is set on them.
let isConvergent = 1 in {
  defm ConvergentCall    : CALL<"call",     PrintConvergentCall>;
  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
}
1898
// Concrete ld.param instructions.  The 8-bit variants use Int16Regs as their
// register class.

// Integer: scalar, v2 and v4.
def LoadParamMemI64   : LoadParamMemInst<Int64Regs, ".b64">;
def LoadParamMemI32   : LoadParamMemInst<Int32Regs, ".b32">;
def LoadParamMemI16   : LoadParamMemInst<Int16Regs, ".b16">;
def LoadParamMemI8    : LoadParamMemInst<Int16Regs, ".b8">;
def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
def LoadParamMemV2I8  : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8  : LoadParamV4MemInst<Int16Regs, ".b8">;

// Floating point: scalar, v2 and v4.
def LoadParamMemF32   : LoadParamMemInst<Float32Regs, ".f32">;
def LoadParamMemF64   : LoadParamMemInst<Float64Regs, ".f64">;
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
1915
// Concrete st.param instructions for outgoing call parameters.  The 8-bit
// variants use Int16Regs as their register class.

// Integer: scalar, v2 and v4.
def StoreParamI64   : StoreParamInst<Int64Regs, ".b64">;
def StoreParamI32   : StoreParamInst<Int32Regs, ".b32">;
def StoreParamI16   : StoreParamInst<Int16Regs, ".b16">;
def StoreParamI8    : StoreParamInst<Int16Regs, ".b8">;
def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
def StoreParamV2I8  : StoreParamV2Inst<Int16Regs, ".b8">;
def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
def StoreParamV4I8  : StoreParamV4Inst<Int16Regs, ".b8">;

// Floating point: scalar, v2 and v4.
def StoreParamF32   : StoreParamInst<Float32Regs, ".f32">;
def StoreParamF64   : StoreParamInst<Float64Regs, ".f64">;
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
1935
// Concrete st.param instructions for this function's return value.  The 8-bit
// variants use Int16Regs as their register class.

// Integer: scalar, v2 and v4.
def StoreRetvalI64   : StoreRetvalInst<Int64Regs, ".b64">;
def StoreRetvalI32   : StoreRetvalInst<Int32Regs, ".b32">;
def StoreRetvalI16   : StoreRetvalInst<Int16Regs, ".b16">;
def StoreRetvalI8    : StoreRetvalInst<Int16Regs, ".b8">;
def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
def StoreRetvalV2I8  : StoreRetvalV2Inst<Int16Regs, ".b8">;
def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8  : StoreRetvalV4Inst<Int16Regs, ".b8">;

// Floating point: scalar, v2 and v4.
def StoreRetvalF64   : StoreRetvalInst<Float64Regs, ".f64">;
def StoreRetvalF32   : StoreRetvalInst<Float32Regs, ".f32">;
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
1953
// Punctuation around a call's argument list.  CallArgEnd with operand 1
// prints ");" and with operand 0 prints ")" (no trailing semicolon).
def CallArgBeginInst :
  NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1 :
  NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0 :
  NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;

// Function return.
def RETURNInst :
  NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
1958
// Call arguments.  A CallArg prints its operand followed by ", "; a
// LastCallArg prints the operand alone.  The first node operand is 0 for a
// register or immediate argument and 1 for a param<N> symbol.
class CallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs), (ins regclass:$a), "$a, ",
              [(CallArg (i32 0), regclass:$a)]>;

class LastCallArgInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs), (ins regclass:$a), "$a",
              [(LastCallArg (i32 0), regclass:$a)]>;

// Register arguments.
def CallArgI64 : CallArgInst<Int64Regs>;
def CallArgI32 : CallArgInst<Int32Regs>;
def CallArgI16 : CallArgInst<Int16Regs>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;

def LastCallArgI64 : LastCallArgInst<Int64Regs>;
def LastCallArgI32 : LastCallArgInst<Int32Regs>;
def LastCallArgI16 : LastCallArgInst<Int16Regs>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
def LastCallArgF32 : LastCallArgInst<Float32Regs>;

// 32-bit immediate arguments.
def CallArgI32imm :
  NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
            [(CallArg (i32 0), (i32 imm:$a))]>;
def LastCallArgI32imm :
  NVPTXInst<(outs), (ins i32imm:$a), "$a",
            [(LastCallArg (i32 0), (i32 imm:$a))]>;

// Arguments passed by param<N> symbol.
def CallArgParam :
  NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
            [(CallArg (i32 1), (i32 imm:$a))]>;
def LastCallArgParam :
  NVPTXInst<(outs), (ins i32imm:$a), "param$a",
            [(LastCallArg (i32 1), (i32 imm:$a))]>;
1988
// The call target: a wrapped global address, or an address held in a 32-bit
// or 64-bit register.  Each prints the target followed by ", ".
def CallVoidInst :
  NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
            [(CallVoid (Wrapper tglobaladdr:$addr))]>;
def CallVoidInstReg :
  NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
            [(CallVoid Int32Regs:$addr)]>;
def CallVoidInstReg64 :
  NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
            [(CallVoid Int64Regs:$addr)]>;

// Reference to the symbol prototype_<val>, printed with a terminating ";".
def PrototypeInst :
  NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
            [(Prototype (i32 imm:$val))]>;
1997
// Declarations of the retval<N> symbols.  The aggregate form is a .b8 array
// with explicit alignment.  For DeclareRet the first node operand picks the
// scalar form: 1 prints a .param declaration, 2 a .reg declaration.
def DeclareRetMemInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
            ".param .align $align .b8 retval$num[$size];",
            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size),
                              (i32 imm:$num))]>;
def DeclareRetScalarInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".param .b$size retval$num;",
            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
def DeclareRetRegInst :
  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
            ".reg .b$size retval$num;",
            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;

// Declarations of the param<N> symbols.  For DeclareScalarParam the last node
// operand picks the form: 0 prints a .param declaration, 1 a .reg declaration.
def DeclareParamInst :
  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
            ".param .align $align .b8 param$a[$size];",
            [(DeclareParam (i32 imm:$align), (i32 imm:$a),
                           (i32 imm:$size))]>;
def DeclareScalarParamInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".param .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :
  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
            ".reg .b$size param$a;",
            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2023
// Selects MoveParam as a same-class register move: "mov" plus the type suffix
// given in `asmstr`.
class MoveParamInst<NVPTXRegClass regclass, string asmstr>
  : NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
              !strconcat("mov", asmstr, "\t$dst, $src;"),
              [(set regclass:$dst, (MoveParam regclass:$src))]>;

def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;

// The 16-bit case is spelled out because it prints cvt.u16.u32 rather than a
// mov.
// NOTE(review): both operands are Int16Regs while the mnemonic names a u32
// source; confirm this is intentional.
def MoveParamI16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "cvt.u16.u32\t$dst, $src;",
            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;

def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
2037
// Selects PseudoUseParam.  The assembly text is only a "//" comment line
// naming the register.
class PseudoUseParamInst<NVPTXRegClass regclass>
  : NVPTXInst<(outs), (ins regclass:$src),
              "// Pseudo use of $src",
              [(PseudoUseParam regclass:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
2048
2049
2050//
2051// Load / Store Handling
2052//
// Load instructions for one destination register class.  Every variant takes
// the same LdStCode modifiers ($isVol, $addsp, $Vec, $Sign) and a $fromWidth
// immediate, and differs only in how the address is given:
//
//   _avar    - imem address
//   _areg    - address in a 32-bit register
//   _areg_64 - address in a 64-bit register
//   _ari     - 32-bit register plus immediate offset
//   _ari_64  - 64-bit register plus immediate offset
//   _asi     - imem address plus immediate offset
//
// None of the variants has a selection pattern.
multiclass LD<NVPTXRegClass regclass> {
  def _avar :
    NVPTXInst<(outs regclass:$dst),
              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                   LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
              "\t$dst, [$addr];",
              []>;
  def _areg :
    NVPTXInst<(outs regclass:$dst),
              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                   LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
              "\t$dst, [$addr];",
              []>;
  def _areg_64 :
    NVPTXInst<(outs regclass:$dst),
              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                   LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
              "\t$dst, [$addr];",
              []>;
  def _ari :
    NVPTXInst<(outs regclass:$dst),
              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                   LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
                   i32imm:$offset),
              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
              "\t$dst, [$addr+$offset];",
              []>;
  def _ari_64 :
    NVPTXInst<(outs regclass:$dst),
              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                   LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
                   i32imm:$offset),
              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
              "\t$dst, [$addr+$offset];",
              []>;
  def _asi :
    NVPTXInst<(outs regclass:$dst),
              (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
                   LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
                   i32imm:$offset),
              "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
              "\t$dst, [$addr+$offset];",
              []>;
}
2091
// Instantiate the scalar loads for each value type; all are marked mayLoad
// with no other side effects.  Note that LD_i8 and LD_i16 both use Int16Regs
// as the destination register class.
2092let mayLoad=1, hasSideEffects=0 in {
2093  defm LD_i8  : LD<Int16Regs>;
2094  defm LD_i16 : LD<Int16Regs>;
2095  defm LD_i32 : LD<Int32Regs>;
2096  defm LD_i64 : LD<Int64Regs>;
2097  defm LD_f32 : LD<Float32Regs>;
2098  defm LD_f64 : LD<Float64Regs>;
2099}
2100
// Scalar store instructions writing one value from `regclass`.
//
// The value to store ($src) is the first input, followed by the modifier
// operands ($isVol, $addsp, $Vec, $Sign, $toWidth) that are printed into the
// "st..." mnemonic, and finally the address.  The variants differ only in how
// the address is supplied.  All variants have an empty pattern list.
2101multiclass ST<NVPTXRegClass regclass> {
  // Address is an imem operand.
2102  def _avar : NVPTXInst<
2103    (outs),
2104    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2105         LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
2106    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2107    " \t[$addr], $src;", []>;
  // Address is held in a 32-bit register.
2108  def _areg : NVPTXInst<
2109    (outs),
2110    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
2111         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
2112    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2113    " \t[$addr], $src;", []>;
  // Address is held in a 64-bit register.
2114  def _areg_64 : NVPTXInst<
2115    (outs),
2116    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2117         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
2118    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2119    " \t[$addr], $src;", []>;
  // Address is a 32-bit register plus an immediate offset.
2120  def _ari : NVPTXInst<
2121    (outs),
2122    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2123         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
2124    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2125    " \t[$addr+$offset], $src;", []>;
  // Address is a 64-bit register plus an immediate offset.
2126  def _ari_64 : NVPTXInst<
2127    (outs),
2128    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2129         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
2130    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2131    " \t[$addr+$offset], $src;", []>;
  // Address is an imem operand plus an immediate offset.
2132  def _asi : NVPTXInst<
2133    (outs),
2134    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
2135         LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
2136    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2137    " \t[$addr+$offset], $src;", []>;
2138}
2139
// Instantiate the scalar stores for each value type; all are marked mayStore
// with no other side effects.  Note that ST_i8 and ST_i16 both use Int16Regs
// as the source register class.
2140let mayStore=1, hasSideEffects=0 in {
2141  defm ST_i8  : ST<Int16Regs>;
2142  defm ST_i16 : ST<Int16Regs>;
2143  defm ST_i32 : ST<Int32Regs>;
2144  defm ST_i64 : ST<Int64Regs>;
2145  defm ST_f32 : ST<Float32Regs>;
2146  defm ST_f64 : ST<Float64Regs>;
2147}
2148
2149// The following is used only in and after vector elementizations.  Vector
2150// elementization happens at the machine instruction level, so the following
2151// instructions never appear in the DAG.
// Vector loads producing two (_v2_*) or four (_v4_*) results in `regclass`.
// The results are printed as a brace-enclosed register list.  Modifier
// operands and the six addressing variants (_avar, _areg, _areg_64, _ari,
// _ari_64, _asi) mirror the scalar LD multiclass.  All variants have an empty
// pattern list.
2152multiclass LD_VEC<NVPTXRegClass regclass> {
  // ---- Two-element loads ----
2153  def _v2_avar : NVPTXInst<
2154    (outs regclass:$dst1, regclass:$dst2),
2155    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2156         i32imm:$fromWidth, imem:$addr),
2157    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2158    "\t{{$dst1, $dst2}}, [$addr];", []>;
2159  def _v2_areg : NVPTXInst<
2160    (outs regclass:$dst1, regclass:$dst2),
2161    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2162         i32imm:$fromWidth, Int32Regs:$addr),
2163    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2164    "\t{{$dst1, $dst2}}, [$addr];", []>;
2165  def _v2_areg_64 : NVPTXInst<
2166    (outs regclass:$dst1, regclass:$dst2),
2167    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2168         i32imm:$fromWidth, Int64Regs:$addr),
2169    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2170    "\t{{$dst1, $dst2}}, [$addr];", []>;
2171  def _v2_ari : NVPTXInst<
2172    (outs regclass:$dst1, regclass:$dst2),
2173    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2174         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2175    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2176    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2177  def _v2_ari_64 : NVPTXInst<
2178    (outs regclass:$dst1, regclass:$dst2),
2179    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2180         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2181    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2182    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
2183  def _v2_asi : NVPTXInst<
2184    (outs regclass:$dst1, regclass:$dst2),
2185    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2186         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2187    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2188    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  // ---- Four-element loads ----
2189  def _v4_avar : NVPTXInst<
2190    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2191    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2192         i32imm:$fromWidth, imem:$addr),
2193    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2194    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2195  def _v4_areg : NVPTXInst<
2196    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2197    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2198         i32imm:$fromWidth, Int32Regs:$addr),
2199    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2200    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2201  def _v4_areg_64 : NVPTXInst<
2202    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2203    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2204         i32imm:$fromWidth, Int64Regs:$addr),
2205    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2206    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2207  def _v4_ari : NVPTXInst<
2208    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2209    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2210         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2211    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2212    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2213  def _v4_ari_64 : NVPTXInst<
2214    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2215    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2216         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2217    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2218    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2219  def _v4_asi : NVPTXInst<
2220    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2221    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2222         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2223    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2224    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
2225}
// Instantiate the vector loads for each element type; all are marked mayLoad
// with no other side effects.  LDV_i8 and LDV_i16 both use Int16Regs.
2226let mayLoad=1, hasSideEffects=0 in {
2227  defm LDV_i8  : LD_VEC<Int16Regs>;
2228  defm LDV_i16 : LD_VEC<Int16Regs>;
2229  defm LDV_i32 : LD_VEC<Int32Regs>;
2230  defm LDV_i64 : LD_VEC<Int64Regs>;
2231  defm LDV_f32 : LD_VEC<Float32Regs>;
2232  defm LDV_f64 : LD_VEC<Float64Regs>;
2233}
2234
// Vector stores writing two (_v2_*) or four (_v4_*) values from `regclass`.
// The source registers come first in the operand list and are printed as a
// brace-enclosed register list.  The six addressing variants mirror the
// scalar ST multiclass.  Unlike ST, the width operand here is named
// $fromWidth rather than $toWidth.  All variants have an empty pattern list.
2235multiclass ST_VEC<NVPTXRegClass regclass> {
  // ---- Two-element stores ----
2236  def _v2_avar : NVPTXInst<
2237    (outs),
2238    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2239         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
2240    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2241    "\t[$addr], {{$src1, $src2}};", []>;
2242  def _v2_areg : NVPTXInst<
2243    (outs),
2244    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2245         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
2246    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2247    "\t[$addr], {{$src1, $src2}};", []>;
2248  def _v2_areg_64 : NVPTXInst<
2249    (outs),
2250    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2251         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
2252    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2253    "\t[$addr], {{$src1, $src2}};", []>;
2254  def _v2_ari : NVPTXInst<
2255    (outs),
2256    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2257         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
2258         i32imm:$offset),
2259    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2260    "\t[$addr+$offset], {{$src1, $src2}};", []>;
2261  def _v2_ari_64 : NVPTXInst<
2262    (outs),
2263    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2264         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
2265         i32imm:$offset),
2266    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2267    "\t[$addr+$offset], {{$src1, $src2}};", []>;
2268  def _v2_asi : NVPTXInst<
2269    (outs),
2270    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
2271         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
2272         i32imm:$offset),
2273    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2274    "\t[$addr+$offset], {{$src1, $src2}};", []>;
  // ---- Four-element stores ----
2275  def _v4_avar : NVPTXInst<
2276    (outs),
2277    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2278         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2279         i32imm:$fromWidth, imem:$addr),
2280    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2281    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2282  def _v4_areg : NVPTXInst<
2283    (outs),
2284    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2285         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2286         i32imm:$fromWidth, Int32Regs:$addr),
2287    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2288    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2289  def _v4_areg_64 : NVPTXInst<
2290    (outs),
2291    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2292         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2293         i32imm:$fromWidth, Int64Regs:$addr),
2294    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2295    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2296  def _v4_ari : NVPTXInst<
2297    (outs),
2298    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2299         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2300         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
2301    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2302    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
2303  def _v4_ari_64 : NVPTXInst<
2304    (outs),
2305    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2306         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2307         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
2308    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2309    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
2310  def _v4_asi : NVPTXInst<
2311    (outs),
2312    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2313         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2314         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
2315    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
2316    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
2317}
2318
// Instantiate the vector stores for each element type; all are marked
// mayStore with no other side effects.  STV_i8 and STV_i16 both use Int16Regs.
2319let mayStore=1, hasSideEffects=0 in {
2320  defm STV_i8  : ST_VEC<Int16Regs>;
2321  defm STV_i16 : ST_VEC<Int16Regs>;
2322  defm STV_i32 : ST_VEC<Int32Regs>;
2323  defm STV_i64 : ST_VEC<Int64Regs>;
2324  defm STV_f32 : ST_VEC<Float32Regs>;
2325  defm STV_f64 : ST_VEC<Float64Regs>;
2326}
2327
2328
2329//---- Conversion ----
2330
// Same-width bit reinterpretation between register classes.
//   SzStr       - bit width appended to the "mov.b" mnemonic (e.g. "32").
//   regclassIn  - register class of the source operand $a.
//   regclassOut - register class of the result $d.
// The instruction is selected for the `bitconvert` node and prints as
// "mov.b<SzStr> $d, $a;".
2331class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
2332  NVPTXRegClass regclassOut> :
2333           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
2334           !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
2335     [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
2336
// Integer <-> floating-point bitcasts at 32 and 64 bits.
2337def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
2338def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
2339def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
2340def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
2341
2342// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
2343// we cannot specify floating-point literals in isel patterns.  Therefore, we
2344// use an integer selp to select either 1 or 0 and then cvt to floating-point.
2345
// Integer -> floating-point conversion patterns.  All use round-to-nearest
// (CvtRN).  i1 sources are first materialized as a 32-bit integer with selp
// and then converted.
//
// For the signed conversions the i1 source is interpreted as a signed 1-bit
// integer, so a true predicate has the value -1 (not 1); the selp therefore
// chooses between -1 and 0.  The unsigned conversions choose between 1 and 0.

2346// sint -> f32
2347def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
2348          (CVT_f32_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
2349def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
2350          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
2351def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
2352          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
2353def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
2354          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
2355
2356// uint -> f32
2357def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
2358          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
2359def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
2360          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
2361def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
2362          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
2363def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
2364          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
2365
2366// sint -> f64
2367def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
2368          (CVT_f64_s32 (SELP_s32ii -1, 0, Int1Regs:$a), CvtRN)>;
2369def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
2370          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
2371def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
2372          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
2373def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
2374          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
2375
2376// uint -> f64
2377def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
2378          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
2379def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
2380          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
2381def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
2382          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
2383def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
2384          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
2385
2386
// Floating-point -> integer conversion patterns.  Non-i1 results use the
// round-toward-zero integer mode (CvtRZI).  Each f32 source has two patterns:
// one using the _FTZ mode that is guarded by the doF32FTZ predicate, and an
// unguarded one using the plain mode.  The f64 sources have only the plain
// mode.
//
// NOTE(review): the i1 patterns compare the raw bit pattern of the source
// against 0 with CmpEQ, so the predicate is true when the bits are all zero
// (i.e. for +0.0).  That looks inverted relative to converting 0.0 to the
// integer 0 -- confirm the intended result before relying on these.

2387// f32 -> sint
2388def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
2389          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
2390def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
2391          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2392def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
2393          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
2394def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
2395          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2396def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
2397          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
2398def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
2399          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2400def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
2401          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
2402
2403// f32 -> uint
2404def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
2405          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
2406def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
2407          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2408def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
2409          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
2410def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
2411          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2412def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
2413          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
2414def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
2415          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
2416def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
2417          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
2418
2419// f64 -> sint
2420def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
2421          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
2422def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
2423          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
2424def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
2425          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
2426def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
2427          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
2428
2429// f64 -> uint
2430def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
2431          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
2432def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
2433          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
2434def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
2435          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
2436def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
2437          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
2438
// Extensions from i1 are implemented with selp on the predicate, choosing
// between two immediates.

2439// sext i1
// true -> -1 (all ones), false -> 0.
2440def : Pat<(i16 (sext Int1Regs:$a)),
2441          (SELP_s16ii -1, 0, Int1Regs:$a)>;
2442def : Pat<(i32 (sext Int1Regs:$a)),
2443          (SELP_s32ii -1, 0, Int1Regs:$a)>;
2444def : Pat<(i64 (sext Int1Regs:$a)),
2445          (SELP_s64ii -1, 0, Int1Regs:$a)>;
2446
2447// zext i1
// true -> 1, false -> 0.
2448def : Pat<(i16 (zext Int1Regs:$a)),
2449          (SELP_u16ii 1, 0, Int1Regs:$a)>;
2450def : Pat<(i32 (zext Int1Regs:$a)),
2451          (SELP_u32ii 1, 0, Int1Regs:$a)>;
2452def : Pat<(i64 (zext Int1Regs:$a)),
2453          (SELP_u64ii 1, 0, Int1Regs:$a)>;
2454
2455// anyext i1
// The upper bits are unspecified for anyext; these patterns select -1 / 0.
2456def : Pat<(i16 (anyext Int1Regs:$a)),
2457          (SELP_u16ii -1, 0, Int1Regs:$a)>;
2458def : Pat<(i32 (anyext Int1Regs:$a)),
2459          (SELP_u32ii -1, 0, Int1Regs:$a)>;
2460def : Pat<(i64 (anyext Int1Regs:$a)),
2461          (SELP_u64ii -1, 0, Int1Regs:$a)>;
2462
// Extensions from i16 and i32 are implemented with cvt and no rounding mode
// (CvtNONE): signed cvt for sext, unsigned cvt for both zext and anyext.

2463// sext i16
2464def : Pat<(i32 (sext Int16Regs:$a)),
2465          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
2466def : Pat<(i64 (sext Int16Regs:$a)),
2467          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
2468
2469// zext i16
2470def : Pat<(i32 (zext Int16Regs:$a)),
2471          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
2472def : Pat<(i64 (zext Int16Regs:$a)),
2473          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
2474
2475// anyext i16
2476def : Pat<(i32 (anyext Int16Regs:$a)),
2477          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
2478def : Pat<(i64 (anyext Int16Regs:$a)),
2479          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
2480
2481// sext i32
2482def : Pat<(i64 (sext Int32Regs:$a)),
2483          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
2484
2485// zext i32
2486def : Pat<(i64 (zext Int32Regs:$a)),
2487          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
2488
2489// anyext i32
2490def : Pat<(i64 (anyext Int32Regs:$a)),
2491          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
2492
2493
// Truncation patterns.  Truncation to i16/i32 uses an unsigned cvt with no
// rounding mode.  Truncation to i1 keeps only the low bit: the source is
// ANDed with 1 and the predicate is set when the result equals 1.

2494// truncate i64
2495def : Pat<(i32 (trunc Int64Regs:$a)),
2496          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
2497def : Pat<(i16 (trunc Int64Regs:$a)),
2498          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
2499def : Pat<(i1 (trunc Int64Regs:$a)),
2500          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
2501
2502// truncate i32
2503def : Pat<(i16 (trunc Int32Regs:$a)),
2504          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
2505def : Pat<(i1 (trunc Int32Regs:$a)),
2506          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
2507
2508// truncate i16
2509def : Pat<(i1 (trunc Int16Regs:$a)),
2510          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
2511
2512// sext_inreg
// Sign-extend a narrower value (i8/i16/i32) held in a wider register of the
// same class; each combination maps to the matching CVT_INREG_* instruction.
2513def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
2514def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
2515def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
2516def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
2517def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
2518def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
2519
2520
2521// Select instructions with 32-bit predicates
// The condition arrives in a 32-bit register rather than a predicate
// register.  Its low bit is isolated ((pred & 1) == 1) to build the i1
// predicate that selp consumes.  One pattern per selected value type.
2522def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
2523          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
2524          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2525def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
2526          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
2527          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2528def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
2529          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
2530          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2531def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
2532          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
2533          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2534def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
2535          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
2536          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
2537
2538
// Register pack/unpack moves.  Each is a mov.b32/mov.b64 between one wide
// register and a brace-enclosed list of narrower registers.  None has a
// selection pattern.
2539let hasSideEffects = 0 in {
2540  // pack a set of smaller int registers to a larger int register
2541  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
2542                             (ins Int16Regs:$s1, Int16Regs:$s2,
2543                                  Int16Regs:$s3, Int16Regs:$s4),
2544                             "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
2545  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
2546                             (ins Int16Regs:$s1, Int16Regs:$s2),
2547                             "mov.b32\t$d, {{$s1, $s2}};", []>;
2548  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
2549                             (ins Int32Regs:$s1, Int32Regs:$s2),
2550                             "mov.b64\t$d, {{$s1, $s2}};", []>;
  // Same packing form, but for two f32 registers into one f64 register.
2551  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
2552                             (ins Float32Regs:$s1, Float32Regs:$s2),
2553                             "mov.b64\t$d, {{$s1, $s2}};", []>;
2554
2555  // unpack a larger int register to a set of smaller int registers
2556  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
2557                                   Int16Regs:$d3, Int16Regs:$d4),
2558                             (ins Int64Regs:$s),
2559                             "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
2560  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
2561                             (ins Int32Regs:$s),
2562                             "mov.b32\t{{$d1, $d2}}, $s;", []>;
2563  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
2564                             (ins Int64Regs:$s),
2565                             "mov.b64\t{{$d1, $d2}}, $s;", []>;
  // Same unpacking form, but for one f64 register into two f32 registers.
2566  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
2567                             (ins Float64Regs:$s),
2568                             "mov.b64\t{{$d1, $d2}}, $s;", []>;
2569}
2570
2571// Count leading zeros
// Both instructions write a 32-bit result, including the 64-bit-input form.
// They carry no pattern themselves; the ctlz patterns below select them.
2572let hasSideEffects = 0 in {
2573  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
2574                         "clz.b32\t$d, $a;", []>;
2575  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
2576                         "clz.b64\t$d, $a;", []>;
2577}
2578
2579// 32-bit has a direct PTX instruction
2580def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
2581
2582// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
2583// to 64-bit to match the LLVM semantics
2584def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
2585
2586// For 16-bit, we zero-extend to 32-bit, then trunc the result back
2587// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
2588// than 16 bits to store). We also need to subtract 16 because the
2589// high-order 16 zeros were counted.
2590def : Pat<(ctlz Int16Regs:$a),
2591          (SUBi16ri (CVT_u16_u32 (CLZr32
2592            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
2593           CvtNONE), 16)>;
2594
2595// Population count
// Both instructions write a 32-bit result, including the 64-bit-input form.
// They carry no pattern themselves; the ctpop patterns below select them.
2596let hasSideEffects = 0 in {
2597  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
2598                          "popc.b32\t$d, $a;", []>;
2599  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
2600                          "popc.b64\t$d, $a;", []>;
2601}
2602
2603// 32-bit has a direct PTX instruction
2604def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
2605
2606// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
2607// to 64-bit to match the LLVM semantics
2608def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
2609
2610// For 16-bit, we zero-extend to 32-bit, then trunc the result back
2611// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
2612// than 16 bits to store)
2613def : Pat<(ctpop Int16Regs:$a),
2614          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
2615
// f32 <-> f64 conversions.  Each direction has two patterns: one using the
// _FTZ conversion mode, guarded by the doF32FTZ predicate, and an unguarded
// one using the plain mode.

2616// fround f64 -> f32
// Narrowing rounds to nearest (CvtRN / CvtRN_FTZ).
2617def : Pat<(f32 (fround Float64Regs:$a)),
2618          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
2619def : Pat<(f32 (fround Float64Regs:$a)),
2620          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
2621
2622// fextend f32 -> f64
// Widening uses no rounding mode (CvtNONE / CvtNONE_FTZ).
2623def : Pat<(f64 (fextend Float32Regs:$a)),
2624          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
2625def : Pat<(f64 (fextend Float32Regs:$a)),
2626          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
2627
// DAG node for NVPTXISD::RET_FLAG: no operands or results (SDTNone), chained,
// with an optional incoming glue.  Matched by the Return instruction.
2628def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
2629                     [SDNPHasChain, SDNPOptInGlue]>;
2630
2631//-----------------------------------
2632// Control-flow
2633//-----------------------------------
2634
// Terminator instructions: return, the two conditional branches, and the
// unconditional branch.
2635let isTerminator=1 in {
   // "ret;" -- selected for the retflag node.
2636   let isReturn=1, isBarrier=1 in
2637      def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
2638
   // Branch to $target when predicate $a is true; selected for brcond.
2639   let isBranch=1 in
2640      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
2641                              "@$a bra \t$target;",
2642                              [(brcond Int1Regs:$a, bb:$target)]>;
   // Branch to $target when predicate $a is false ("@!$a").  It has no
   // pattern of its own here.
2643   let isBranch=1 in
2644      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
2645                                   "@!$a bra \t$target;", []>;
2646
   // Unconditional branch ("bra.uni"); selected for br.
2647   let isBranch=1, isBarrier=1 in
2648      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
2649                           "bra.uni \t$target;", [(br bb:$target)]>;
2650}
2651
// Conditional branch on a 32-bit condition: compare it against 0 with CmpNE
// to form the predicate, then use the ordinary predicated branch.
2652def : Pat<(brcond Int32Regs:$a, bb:$target),
2653          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
2654
2655// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
2656// conditional branch if the target block is the next block so that the code
2657// can fall through to the target block.  The inversion is done by 'xor
2658// condition, 1', which will be translated to (setne condition, -1).  Since ptx
2659// supports '@!pred bra target', we should use it.
// Match a branch on (setne $a, -1) over an i1 predicate and emit
// CBranchOther, which prints the negated-predicate form "@!$a bra".
2660def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
2661          (CBranchOther Int1Regs:$a, bb:$target)>;
2662
2663// Call
// Type profiles for the call-sequence markers: callseq_start takes one i32
// operand, callseq_end takes two.
2664def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
2665def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
2666
// DAG nodes bracketing a call sequence.  Both are chained, produce glue and
// are marked as having side effects; callseq_end also accepts optional
// incoming glue.
2667def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
2668                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
2669def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
2670                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
2671                            SDNPSideEffect]>;
2672
// The call node itself: no results, one i32 operand (the callee).
2673def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
2674def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
2675                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
2676def calltarget : Operand<i32>;
// CALL carries no pattern of its own; the two Pat records below map the call
// node to it.
2677let isCall=1 in {
2678   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
2679}
2680
// Select CALL for callees given as a target global address or a target
// external symbol.
2681def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
2682def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
2683
2684// Pseudo instructions.
// Thin alias of NVPTXInst that forwards all four arguments unchanged.  The
// definitions visible in this section derive from NVPTXInst directly rather
// than from this class.
2685class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
2686   : NVPTXInst<outs, ins, asmstr, pattern>;
2687
// Start of a call sequence: prints an opening brace followed by a
// "// callseq $amt" comment, then declares the .b32 register temp_param_reg
// on the next line.
2688def Callseq_Start :
2689  NVPTXInst<(outs), (ins i32imm:$amt),
2690            "\\{ // callseq $amt\n"
2691            "\t.reg .b32 temp_param_reg;",
2692           [(callseq_start timm:$amt)]>;
// End of a call sequence: prints the matching closing brace.  Only $amt1
// appears in the printed text; $amt2 is matched but not printed.
2693def Callseq_End :
2694  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
2695            "\\} // callseq $amt1",
2696            [(callseq_end timm:$amt1, timm:$amt2)]>;
2697
2698// trap instruction
// Selected for the `trap` node; prints "trap;" and takes no operands.
2699def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
2700
2701// Call prototype wrapper
// Type profile: no results, one integer operand.
2702def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
// DAG node for NVPTXISD::CallPrototype: chained, consumes and produces glue,
// and is marked as having side effects.
2703def CallPrototype :
2704  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
2705         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
// i32 operand printed through the custom "printProtoIdent" print method.
2706def ProtoIdent : Operand<i32> {
2707  let PrintMethod = "printProtoIdent";
2708}
// The printed instruction consists solely of the $ident operand; selected
// for CallPrototype applied to a target external symbol.
2709def CALL_PROTOTYPE :
2710  NVPTXInst<(outs), (ins ProtoIdent:$ident),
2711            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
2712
2713
2714include "NVPTXIntrinsics.td"
2715
2716
2717//-----------------------------------
2718// Notes
2719//-----------------------------------
2720// BSWAP is currently expanded. The following would be more efficient:
2721// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
2722// - for sm_20, use prmt (use vector scalar mov to get the pack and
2723//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
2724//   registers.
2725