//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//


//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))]>;
}
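// Illustration (hypothetical, not an actual definition in this file): an
// instantiation along the lines of
//   defm ADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem>;
// would expand to two records, ADDrr (register-register) and ADDrm
// (register-memory), sharing the opcode and the assembly string built above.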

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             bit Is2Addr = 1> {
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))]>;
}
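// Illustrative note: the intrinsic is looked up by name at TableGen time.
// With, say, SSEVer = "2", OpcodeStr = "add" and FPSizeStr = "_sd"
// (hypothetical values), !strconcat yields "int_x86_sse2_add_sd", and
// !cast<Intrinsic> resolves that string to the intrinsic record of the
// same name.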

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, d>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, d>;
}
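// Illustrative sketch (hypothetical, not defined here): because the rr/rm
// patterns are passed in as explicit dag lists, a logical op can supply
// bitcasted integer patterns, e.g.
//   defm AND : sse12_fp_packed_logical_rm<0x54, VR128, SSEPackedSingle,
//                "andps", f128mem,
//                [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
//                                       (bc_v2i64 (v4f32 VR128:$src2))))],
//                [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
//                                       (memopv2i64 addr:$src2)))]>;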

/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class
multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           string asm, string SSEVer, string FPSizeStr,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, bit Is2Addr = 1> {
  def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
           [(set RC:$dst, (!cast<Intrinsic>(
                     !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
                 RC:$src1, RC:$src2))], d>;
  def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                     !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, (mem_frag addr:$src2)))], d>;
}
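// Note for clarity: unlike the scalar intrinsic class above, which prepends
// "int_x86_sse", this class concatenates "int_x86_" directly with SSEVer, so
// the version string is expected to carry the full ISA name (a hypothetical
// SSEVer = "sse2" with OpcodeStr = "add" and FPSizeStr = "_pd" would resolve
// to "int_x86_sse2_add_pd").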

//===----------------------------------------------------------------------===//
//  Non-instruction patterns
//===----------------------------------------------------------------------===//

// A vector extract of the first f32/f64 position is a subregister copy
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;

// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;

// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;

// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;

// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion.
let Predicates = [HasXMMInt] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}

// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}

// Alias instructions that map fld0 to pxor for sse.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasXMM]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasXMMInt]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, neverHasSideEffects = 1 in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
}

def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;


// The same as done above but for AVX. The 256-bit ISA does not support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
// JIT implementation, which does not expand the instructions below like
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isCodeGenOnly = 1, Predicates = [HasAVX] in {
def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
                   [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
                   [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
}


// AVX has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
// JIT implementation, which does not expand the instructions below like
// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                         [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;


//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
// is used instead. Register-to-register movss/movsd is not modeled as an
// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
// in terms of a copy, and, as just mentioned, we don't use movss/movsd for
// copies.
//===----------------------------------------------------------------------===//

class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> :
      SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
      [(set (vt VR128:$dst), (movl VR128:$src1, (scalar_to_vector RC:$src2)))]>;

// Loading from memory automatically zeroes the upper bits.
class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                    PatFrag mem_pat, string OpcodeStr> :
      SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                        [(set RC:$dst, (mem_pat addr:$src))]>;

// AVX
def VMOVSSrr : sse12_move_rr<FR32, v4f32,
                "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
                VEX_LIG;
def VMOVSDrr : sse12_move_rr<FR64, v2f64,
                "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
                VEX_LIG;

// For the disassembler
let isCodeGenOnly = 1 in {
  def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                        (ins VR128:$src1, FR32:$src2),
                        "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        XS, VEX_4V, VEX_LIG;
  def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                        (ins VR128:$src1, FR64:$src2),
                        "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        XD, VEX_4V, VEX_LIG;
}

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
                 VEX_LIG;
  let AddedComplexity = 20 in
    def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
                   VEX_LIG;
}

def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)]>, XS, VEX, VEX_LIG;
def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)]>, XD, VEX, VEX_LIG;

// SSE1 & 2
let Constraints = "$src1 = $dst" in {
  def MOVSSrr : sse12_move_rr<FR32, v4f32,
                          "movss\t{$src2, $dst|$dst, $src2}">, XS;
  def MOVSDrr : sse12_move_rr<FR64, v2f64,
                          "movsd\t{$src2, $dst|$dst, $src2}">, XD;

  // For the disassembler
  let isCodeGenOnly = 1 in {
    def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src1, FR32:$src2),
                         "movss\t{$src2, $dst|$dst, $src2}", []>, XS;
    def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src1, FR64:$src2),
                         "movsd\t{$src2, $dst|$dst, $src2}", []>, XD;
  }
}

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;

  let AddedComplexity = 20 in
    def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}

def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)]>;
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)]>;

// Patterns
let Predicates = [HasSSE1] in {
  let AddedComplexity = 15 in {
  // Extract the low 32-bit value from one vector and insert it into another.
  def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
            (MOVSSrr (v4f32 VR128:$src1),
                     (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
  def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
            (MOVSSrr (v4i32 VR128:$src1),
                     (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;

  // Move a scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)),
                     (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)),
                     (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst,
                     (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;

  // Shuffle with MOVSS
  def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
            (MOVSSrr VR128:$src1, FR32:$src2)>;
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr (v4i32 VR128:$src1),
                     (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr (v4f32 VR128:$src1),
                     (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
}

let Predicates = [HasSSE2] in {
  let AddedComplexity = 15 in {
  // Extract the low 64-bit value from one vector and insert it into another.
  def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
            (MOVSDrr (v2f64 VR128:$src1),
                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
  def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
            (MOVSDrr (v2i64 VR128:$src1),
                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;

  // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
  def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
  def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;

  // Move a scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
  }

  // Extract and store.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst,
                     (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;

  // Shuffle with MOVSD
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
            (MOVSDrr VR128:$src1, FR64:$src2)>;
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr (v2i64 VR128:$src1),
                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr (v2f64 VR128:$src1),
                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;

  // FIXME: Instead of an X86Movlps there should be an X86Movsd here; the
  // problem is during lowering, where it's not possible to recognize the fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
}

let Predicates = [HasAVX] in {
  let AddedComplexity = 15 in {
  // Extract the low 32-bit value from one vector and insert it into another.
  def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
  def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;

  // Extract the low 64-bit value from one vector and insert it into another.
  def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
            (VMOVSDrr (v2f64 VR128:$src1),
                      (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
  def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
            (VMOVSDrr (v2i64 VR128:$src1),
                      (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;

  // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
  def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
  def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;

  // Move a scalar to XMM zero-extended: zero a VR128, then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)),
                      (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)),
                      (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;

  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types.
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
  }
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
            (SUBREG_TO_REG (i64 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst,
                     (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst,
                     (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;

  // Shuffle with VMOVSS
  def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
            (VMOVSSrr VR128:$src1, FR32:$src2)>;
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;

  // Shuffle with VMOVSD
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
            (VMOVSDrr VR128:$src1, FR64:$src2)>;
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr (v2i64 VR128:$src1),
                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr (v2f64 VR128:$src1),
                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
                                                   sub_sd))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
                                                   sub_sd))>;

  // FIXME: Instead of an X86Movlps there should be an X86Movsd here; the
  // problem is during lowering, where it's not possible to recognize the fold
  // because it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
                                                   sub_sd))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
                                                   sub_sd))>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            bit IsReMaterializable = 1> {
let neverHasSideEffects = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], d>;
}

defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle>, TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble>, TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle>, TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle>, TB, VEX;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble>, TB, OpSize, VEX;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle>, TB, VEX;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle>, TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble>, TB, OpSize;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle>, TB;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, 0>, TB, OpSize;

def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)]>, VEX;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)]>, VEX;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)]>, VEX;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)]>, VEX;

// For the disassembler
let isCodeGenOnly = 1 in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>, VEX;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
}

def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>;
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;

def : Pat<(int_x86_avx_loadu_pd_256 addr:$src), (VMOVUPDYrm addr:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;

def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;

// For the disassembler
let isCodeGenOnly = 1 in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>;
}

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [HasSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [HasSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [HasSSE1] in {
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv4i32 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let neverHasSideEffects = 1 in {
def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                     "movaps\t{$src, $dst|$dst, $src}", []>;
def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                     "movapd\t{$src, $dst|$dst, $src}", []>;
def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                       "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                       "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
}

// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>, VEX;
}
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed<bits<8> opc, RegisterClass RC,
                                 PatFrag mov_frag, string base_opc,
                                 string asm_opr> {
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set RC:$dst,
       (mov_frag RC:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              SSEPackedSingle>, TB;

  def PDrm : PI<opc, MRMSrcMem,
         (outs RC:$dst), (ins RC:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set RC:$dst, (v2f64 (mov_frag RC:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              SSEPackedDouble>, TB, OpSize;
}

let AddedComplexity = 20 in {
  defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
                                   "\t{$src2, $dst|$dst, $src2}">;
}

def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;

let Predicates = [HasAVX] in {
  let AddedComplexity = 20 in {
    // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
    def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
              (VMOVLPSrm VR128:$src1, addr:$src2)>;
    def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
              (VMOVLPSrm VR128:$src1, addr:$src2)>;
    // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
    def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
              (VMOVLPDrm VR128:$src1, addr:$src2)>;
    def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
              (VMOVLPDrm VR128:$src1, addr:$src2)>;
  }

  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
                                 VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;

  // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
  def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;

  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Movlpd VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}

let Predicates = [HasSSE1] in {
  let AddedComplexity = 20 in {
    // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
    def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
              (MOVLPSrm VR128:$src1, addr:$src2)>;
    def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
              (MOVLPSrm VR128:$src1, addr:$src2)>;
  }

  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
                                 VR128:$src2)), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}
1054
1055let Predicates = [HasSSE2] in {
1056  let AddedComplexity = 20 in {
1057    // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
1058    def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
1059              (MOVLPDrm VR128:$src1, addr:$src2)>;
1060    def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
1061              (MOVLPDrm VR128:$src1, addr:$src2)>;
1062  }
1063
1064  // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
1065  def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
1066            (MOVLPDmr addr:$src1, VR128:$src2)>;
1067  def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
1068            (MOVLPDmr addr:$src1, VR128:$src2)>;
1069
1070  // Shuffle with MOVLPD
1071  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
1072            (MOVLPDrm VR128:$src1, addr:$src2)>;
1073  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
1074            (MOVLPDrm VR128:$src1, addr:$src2)>;
1075  def : Pat<(v2f64 (X86Movlpd VR128:$src1,
1076                              (scalar_to_vector (loadf64 addr:$src2)))),
1077            (MOVLPDrm VR128:$src1, addr:$src2)>;
1078
1079  // Store patterns
1080  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
1081                           addr:$src1),
1082            (MOVLPDmr addr:$src1, VR128:$src2)>;
1083  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
1084                           addr:$src1),
1085            (MOVLPDmr addr:$src1, VR128:$src2)>;
1086}
1087
1088//===----------------------------------------------------------------------===//
1089// SSE 1 & 2 - Move Hi packed FP Instructions
1090//===----------------------------------------------------------------------===//
1091
1092let AddedComplexity = 20 in {
1093  defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
1094                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
1095}
1096let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
1097  defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
1098                                   "\t{$src2, $dst|$dst, $src2}">;
1099}
1100
1101// v2f64 extract element 1 is always custom lowered to unpack high to low
1102// and extract element 0 so the non-store version isn't too horrible.
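// For example (illustrative, not from the original source), the IR sequence
//   %e = extractelement <2 x double> %v, i32 1
//   store double %e, double* %p
// is lowered to an unpckh-then-extract-element-0 DAG, which the store
// patterns on the instructions below fold into a single movhps/movhpd store.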
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                         (undef)), (iPTR 0))), addr:$dst)]>,
                   VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (unpckh VR128:$src, (undef))),
                                 (iPTR 0))), addr:$dst)]>,
                   VEX;
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                         (undef)), (iPTR 0))), addr:$dst)]>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (unpckh VR128:$src, (undef))),
                                 (iPTR 0))), addr:$dst)]>;

let Predicates = [HasAVX] in {
  // VMOVHPS patterns
  def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // FIXME: Instead of X86Unpcklpd, there should be an X86Movlhpd here; the
  // problem is during lowering, where it's not possible to recognize the load
  // fold because it has two uses through a bitcast. One use disappears at
  // isel time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // FIXME: This should be matched by an X86Movhpd instead. Same as above.
  def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (f64 (vector_extract
            (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst),
            (VMOVHPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (f64 (vector_extract
            (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [HasSSE1] in {
  // MOVHPS patterns
  def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (f64 (vector_extract
            (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [HasSSE2] in {
  // FIXME: Instead of X86Unpcklpd, there should be an X86Movlhpd here; the
  // problem is during lowering, where it's not possible to recognize the load
  // fold because it has two uses through a bitcast. One use disappears at
  // isel time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // FIXME: This should be matched by an X86Movhpd instead. Same as above.
  def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (f64 (vector_extract
            (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let AddedComplexity = 20 in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
}

let Predicates = [HasAVX] in {
  // MOVLHPS patterns
  let AddedComplexity = 20 in {
    def : Pat<(v4f32 (movddup VR128:$src, (undef))),
              (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
    def : Pat<(v2i64 (movddup VR128:$src, (undef))),
              (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;

    // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
    def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
              (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  }
  def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  let AddedComplexity = 20 in {
    // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
    def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
              (VMOVHLPSrr VR128:$src1, VR128:$src2)>;

    // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
    def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
              (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
    def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
              (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
  }

  def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [HasSSE1] in {
  // MOVLHPS patterns
  let AddedComplexity = 20 in {
    def : Pat<(v4f32 (movddup VR128:$src, (undef))),
              (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
    def : Pat<(v2i64 (movddup VR128:$src, (undef))),
              (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;

    // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
    def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
              (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  }
  def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  let AddedComplexity = 20 in {
    // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
    def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
              (MOVHLPSrr VR128:$src1, VR128:$src2)>;

    // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
    def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
              (MOVHLPSrr VR128:$src1, VR128:$src1)>;
    def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
              (MOVHLPSrr VR128:$src1, VR128:$src1)>;
  }

  def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>;
}

multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, []>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, []>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                         string asm, Domain d> {
  def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))], d>;
  def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], d>;
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
}

defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
                                VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
                                VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX,
                                VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}">, XD,
                                VEX, VEX_W, VEX_LIG;

// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when using only memory operands. Provide
// other assembly "l" and "q" forms to address this explicitly where it is
// appropriate to do so.
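// For example (illustrative AT&T syntax): "cvtsi2ss %rax, %xmm0" is
// unambiguously a 64-bit conversion because of the %rax source, but
// "cvtsi2ss (%rax), %xmm0" could load either 32 or 64 bits, so the explicit
// "cvtsi2ssl" and "cvtsi2ssq" spellings disambiguate the memory forms.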
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS,
                                  VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS,
                                  VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD,
                                  VEX_4V, VEX_LIG;
defm VCVTSI2SDL  : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD,
                                  VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD,
                                  VEX_4V, VEX_W, VEX_LIG;

let Predicates = [HasAVX] in {
  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}

defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}">, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}">, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si{q}\t{$src, $dst|$dst, $src}">, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss\t{$src, $dst|$dst, $src}">, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd\t{$src, $dst|$dst, $src}">, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}">, XD, REX_W;

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
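// For example (illustrative): the C intrinsic _mm_cvtsd_si32 lowers to
// int_x86_sse2_cvtsd2si, which takes the whole <2 x double> vector rather
// than a bare f64, hence the VR128 source operands used below.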

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
                         string asm> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>;
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>;
}

defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                      f128mem, load, "cvtsd2si">, XD, VEX;
defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                      int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">,
                      XD, VEX, VEX_W;

// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_
// in their names. Get rid of this hack or rename the intrinsics; there are
// several instructions that only match with the intrinsic form. Why create
// duplicates just to let them be recognized by the assembler?
defm VCVTSD2SI     : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_LIG;
defm VCVTSD2SI64   : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W,
                      VEX_LIG;

defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                f128mem, load, "cvtsd2si{l}">, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                  f128mem, load, "cvtsd2si{q}">, XD, REX_W;


defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V,
          VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD,
          VEX_4V, VEX_W;

let Constraints = "$src1 = $dst" in {
  defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse_cvtsi2ss, i32mem, loadi32,
                        "cvtsi2ss">, XS;
  defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        int_x86_sse_cvtsi642ss, i64mem, loadi64,
                        "cvtsi2ss{q}">, XS, REX_W;
  defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                        "cvtsi2sd">, XD;
  defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                        "cvtsi2sd">, XD, REX_W;
}

/// SSE 1 Only

// Aliases for intrinsics
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    f32mem, load, "cvttss2si">, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                    int_x86_sse_cvttss2si64, f32mem, load,
                                    "cvttss2si">, XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    f128mem, load, "cvttsd2si">, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                    int_x86_sse2_cvttsd2si64, f128mem, load,
                                    "cvttsd2si">, XD, VEX, VEX_W;
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    f32mem, load, "cvttss2si">, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                    int_x86_sse_cvttss2si64, f32mem, load,
                                    "cvttss2si{q}">, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    f128mem, load, "cvttsd2si">, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                    int_x86_sse2_cvttsd2si64, f128mem, load,
                                    "cvttsd2si{q}">, XD, REX_W;

let Pattern = []<dag> in {
defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
                               "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS,
                               VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
                               "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
                               VEX_W, VEX_LIG;
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
                               "cvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle>, TB, VEX;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
                               "cvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle>, TB, VEX;
}

let Pattern = []<dag> in {
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
                          "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/,
                          "cvtss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle>, TB; /* PD SSE3 form is available */
}

let Predicates = [HasSSE1] in {
  def : Pat<(int_x86_sse_cvtss2si VR128:$src),
            (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
  def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
            (CVTSS2SIrm addr:$src)>;
  def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
            (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
  def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
            (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_cvtss2si VR128:$src),
            (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
  def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
            (VCVTSS2SIrm addr:$src)>;
  def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
            (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
  def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
            (VCVTSS2SI64rm addr:$src)>;
}

/// SSE 2 Only

// Convert scalar double to scalar single
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      VEX_4V, VEX_LIG;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;

def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[HasAVX]>;

def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
                  Requires<[HasSSE2, OptForSize]>;

defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
                      int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>,
                      XS, VEX_4V;
let Constraints = "$src1 = $dst" in
defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
                      int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS;

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;

let Predicates = [HasAVX] in {
  def : Pat<(f64 (fextend FR32:$src)),
            (VCVTSS2SDrr FR32:$src, FR32:$src)>;
  def : Pat<(fextend (loadf32 addr:$src)),
            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(extloadf32 addr:$src),
            (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
}

def : Pat<(extloadf32 addr:$src),
          (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (MOVSSrm addr:$src))>,
          Requires<[HasAVX, OptForSpeed]>;

def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))]>, XS,
                 Requires<[HasSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
                 Requires<[HasSSE2, OptForSize]>;

// extload f32 -> f64.  This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fextend, we have to match them
// explicitly here.
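// For example (illustrative): "double f(float *p) { return *p; }" yields
// (fextend (loadf32 addr)). When optimizing for speed, the extload form is
// instead matched as a separate MOVSSrm load followed by a register
// CVTSS2SDrr, rather than folding the load into the conversion.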
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;

def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                       VR128:$src2))]>, XS, VEX_4V,
                    Requires<[HasAVX]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                       (load addr:$src2)))]>, XS, VEX_4V,
                    Requires<[HasAVX]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                       VR128:$src2))]>, XS,
                    Requires<[HasSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                       (load addr:$src2)))]>, XS,
                    Requires<[HasSSE2]>;
}

// Convert doubleword to packed single/double fp
// SSE2 instructions without OpSize prefix
def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
                     TB, VEX, Requires<[HasAVX]>;
def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
                                        (bitconvert (memopv2i64 addr:$src))))]>,
                     TB, VEX, Requires<[HasAVX]>;
def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
                     TB, Requires<[HasSSE2]>;
def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "cvtdq2ps\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
                                        (bitconvert (memopv2i64 addr:$src))))]>,
                     TB, Requires<[HasSSE2]>;

// FIXME: Why is the non-intrinsic version described as SSE3?
// SSE2 instructions with XS prefix
def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
                     XS, VEX, Requires<[HasAVX]>;
def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
                                        (bitconvert (memopv2i64 addr:$src))))]>,
                     XS, VEX, Requires<[HasAVX]>;
def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
                     XS, Requires<[HasSSE2]>;
def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "cvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
                                        (bitconvert (memopv2i64 addr:$src))))]>,
                     XS, Requires<[HasSSE2]>;


// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>;

def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>,
                        VEX;
def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
                         (ins f128mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq
                                            (memop addr:$src)))]>, VEX;
def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq
                                            (memop addr:$src)))]>;

// SSE2 packed instructions with XD prefix
def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                     XD, VEX, Requires<[HasAVX]>;
def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
                                          (memop addr:$src)))]>,
                     XD, VEX, Requires<[HasAVX]>;
def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
                     XD, Requires<[HasSSE2]>;
def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
                                          (memop addr:$src)))]>,
                     XD, Requires<[HasSSE2]>;


// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
let mayLoad = 1 in
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
let mayLoad = 1 in
def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                      "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttps2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (int_x86_sse2_cvttps2dq VR128:$src))]>;
def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttps2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (int_x86_sse2_cvttps2dq (memop addr:$src)))]>;

def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvttps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttps2dq VR128:$src))]>,
                      XS, VEX, Requires<[HasAVX]>;
def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "vcvttps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                           (memop addr:$src)))]>,
                      XS, VEX, Requires<[HasAVX]>;

let Predicates = [HasSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (Int_CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (Int_VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
}

def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))]>, VEX;
let isCodeGenOnly = 1 in
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                               (memop addr:$src)))]>, VEX;
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memop addr:$src)))]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
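// For example (illustrative AT&T syntax): "cvttpd2dq %ymm1, %xmm0" is
// recognized from the ymm register, but "cvttpd2dq (%rax), %xmm0" would be
// ambiguous, so "cvttpd2dqx (%rax), %xmm0" (128-bit load) and
// "cvttpd2dqy (%rax), %xmm0" (256-bit load) are provided below.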
def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                          "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;

// XMM only
def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;

// Convert packed single to packed double
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
}
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;

def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
                     TB, VEX, Requires<[HasAVX]>;
def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "vcvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
                                          (load addr:$src)))]>,
                     TB, VEX, Requires<[HasAVX]>;
def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
                     TB, Requires<[HasSSE2]>;
def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
                                          (load addr:$src)))]>,
                     TB, Requires<[HasSSE2]>;

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
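// For example (illustrative): the "cvtpd2psx" and "cvtpd2psy" spellings below
// disambiguate memory operands the same way as the cvttpd2dq{x,y} forms above.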
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX;

// XMM only
def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX;

// YMM only
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;


def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
                         (ins f128mem:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
                                            (memop addr:$src)))]>;
def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
                                            (memop addr:$src)))]>;

// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsic matching to use patterns like the
// ones below whenever possible, to avoid declaring two versions of each one.
def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
          (VCVTDQ2PSYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvtdq2_ps_256 (memopv8i32 addr:$src)),
          (VCVTDQ2PSYrm addr:$src)>;

def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src),
          (VCVTPD2PSYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)),
          (VCVTPD2PSYrm addr:$src)>;

def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src),
          (VCVTPS2DQYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)),
          (VCVTPS2DQYrm addr:$src)>;

def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src),
          (VCVTPS2PDYrr VR128:$src)>;
def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)),
          (VCVTPS2PDYrm addr:$src)>;

def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src),
          (VCVTTPD2DQYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)),
          (VCVTTPD2DQYrm addr:$src)>;

def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src),
          (VCVTTPS2DQYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)),
          (VCVTTPS2DQYrm addr:$src)>;

// Match fround and fextend for 128/256-bit conversions
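// For example (illustrative): the IR "fptrunc <4 x double> %x to <4 x float>"
// produces (v4f32 (fround (v4f64 %x))) and selects VCVTPD2PSYrr, while
// "fpext <4 x float> %x to <4 x double>" selects VCVTPS2PDYrr.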
def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
          (VCVTPD2PSYrr VR256:$src)>;
def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
          (VCVTPD2PSYrm addr:$src)>;

def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
          (VCVTPS2PDYrr VR128:$src)>;
def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
          (VCVTPS2PDYrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            SDNode OpNode, ValueType VT, PatFrag ld_frag,
                            string asm, string asm_alt> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))]>;

  // Accept explicit immediate argument form instead of comparison code.
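  // For example (illustrative AT&T syntax): "cmpss $0, %xmm1, %xmm0" is the
  // immediate spelling of "cmpeqss %xmm1, %xmm0" (imm 0 = EQ, 1 = LT, 2 = LE,
  // 3 = UNORD, 4 = NEQ, 5 = NLT, 6 = NLE, 7 = ORD).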
  let neverHasSideEffects = 1 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, []>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, []>;
  }
}

defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
                 XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
                 XD, VEX_4V, VEX_LIG;

let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
                  XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
                  XD;
}

multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
                         Intrinsic Int, string asm> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, SSECC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, f32mem:$src, SSECC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), imm:$cc))]>;
}

// Aliases to match intrinsics which expect XMM operand(s).
defm Int_VCMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
                     "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">,
                     XS, VEX_4V;
defm Int_VCMPSD  : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
                     "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">,
                     XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
  defm Int_CMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS;
  defm Int_CMPSD  : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $dst|$dst, $src}">, XD;
}


// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
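// For example (illustrative): an IR "fcmp" followed by a branch is typically
// lowered to an X86cmp node and selected as UCOMISS/UCOMISD; the branch then
// tests the EFLAGS bits these instructions set.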
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr, Domain d> {
  def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], d>;
  def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))], d>;
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
                                  VEX_LIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss", SSEPackedSingle>, TB, VEX,
                                    VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd", SSEPackedDouble>, TB, OpSize, VEX,
                                    VEX_LIG;
  }

  defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                            load, "ucomiss", SSEPackedSingle>, TB, VEX;
  defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                            load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;

  defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                            load, "comiss", SSEPackedSingle>, TB, VEX;
  defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                            load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", SSEPackedSingle>, TB;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", SSEPackedDouble>, TB, OpSize;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss", SSEPackedSingle>, TB;
    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd", SSEPackedDouble>, TB, OpSize;
  }

  defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss", SSEPackedSingle>, TB;
  defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd", SSEPackedDouble>, TB, OpSize;

  defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                  "comiss", SSEPackedSingle>, TB;
  defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                  "comisd", SSEPackedDouble>, TB, OpSize;
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Intrinsic Int, string asm, string asm_alt,
                            Domain d> {
  let isAsmParserOnly = 1 in {
    def rri : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
               [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], d>;
    def rmi : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, f128mem:$src2, SSECC:$cc), asm,
               [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], d>;
  }

  // Accept explicit immediate argument form instead of comparison code.
2107  def rri_alt : PIi8<0xC2, MRMSrcReg,
2108             (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
2109             asm_alt, [], d>;
2110  def rmi_alt : PIi8<0xC2, MRMSrcMem,
2111             (outs RC:$dst), (ins RC:$src1, f128mem:$src2, i8imm:$cc),
2112             asm_alt, [], d>;
2113}
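
// Illustrative note: $cc is the SSE comparison predicate encoded in the
// immediate byte: 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord.
// The primary forms fold the predicate into the mnemonic (e.g. "cmpeqps"),
// while the *_alt forms take the raw immediate, so "cmpps $4, %xmm1, %xmm0"
// assembles to the same encoding as "cmpneqps %xmm1, %xmm0".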

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, TB, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, TB, VEX_4V;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle>, TB;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble>, TB, OpSize;
}

let Predicates = [HasSSE1] in {
def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

let Predicates = [HasSSE2] in {
def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d, bit IsConvertibleToThreeAddress = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, f128mem:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (shufp:$src3
                            RC:$src1, (mem_frag addr:$src2))))], d>;
  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
                   [(set RC:$dst,
                            (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>;
}
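
// Illustrative sketch of the shuffle immediate (standard shufps semantics,
// for reference only):
//   dst[0] = src1[imm[1:0]];  dst[1] = src1[imm[3:2]];
//   dst[2] = src2[imm[5:4]];  dst[3] = src2[imm[7:6]];
// e.g. "shufps $0x4e, %xmm1, %xmm0" yields { xmm0[2], xmm0[3], xmm1[0],
// xmm1[1] }. shufpd uses only imm[0] and imm[1], one selector per source.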

defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv8f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V;

let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
                    TB;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble>, TB, OpSize;
}

let Predicates = [HasSSE1] in {
  def : Pat<(v4f32 (X86Shufps VR128:$src1,
                       (memopv4f32 addr:$src2), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufps VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
  // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
  // fall back to this for SSE1)
  def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
            (SHUFPSrri VR128:$src2, VR128:$src1,
                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special unary SHUFPSrri case.
  def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
            (SHUFPSrri VR128:$src1, VR128:$src1,
                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
}

let Predicates = [HasSSE2] in {
  // Special binary v4i32 shuffle cases with SHUFPS.
  def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
            (SHUFPSrri VR128:$src1, VR128:$src2,
                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
  def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
                                (bc_v4i32 (memopv2i64 addr:$src2)))),
            (SHUFPSrmi VR128:$src1, addr:$src2,
                      (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special unary SHUFPDrri cases.
  def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
            (SHUFPDrri VR128:$src1, VR128:$src1,
                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
  def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
            (SHUFPDrri VR128:$src1, VR128:$src1,
                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special binary v2i64 shuffle cases using SHUFPDrri.
  def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
            (SHUFPDrri VR128:$src1, VR128:$src2,
                       (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Generic SHUFPD patterns
  def : Pat<(v2f64 (X86Shufps VR128:$src1,
                       (memopv2f64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
  def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Shufps VR128:$src1,
                       (memopv4f32 addr:$src2), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufps VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
  // vector_shuffle v1, v2 <4, 5, 2, 3> using VSHUFPSrri (we prefer movsd, but
  // fall back to this for SSE1)
  def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
            (VSHUFPSrri VR128:$src2, VR128:$src1,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special unary VSHUFPSrri case.
  def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
            (VSHUFPSrri VR128:$src1, VR128:$src1,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special binary v4i32 shuffle cases with VSHUFPS.
  def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
            (VSHUFPSrri VR128:$src1, VR128:$src2,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
  def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
                                (bc_v4i32 (memopv2i64 addr:$src2)))),
            (VSHUFPSrmi VR128:$src1, addr:$src2,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special unary VSHUFPDrri cases.
  def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
            (VSHUFPDrri VR128:$src1, VR128:$src1,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
  def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
            (VSHUFPDrri VR128:$src1, VR128:$src1,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;
  // Special binary v2i64 shuffle cases using VSHUFPDrri.
  def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
            (VSHUFPDrri VR128:$src1, VR128:$src2,
                        (SHUFFLE_get_shuf_imm VR128:$src3))>;

  def : Pat<(v2f64 (X86Shufps VR128:$src1,
                       (memopv2f64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
  def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufps VR256:$src1,
                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8f32 (X86Shufps VR256:$src1,
                              (memopv8f32 addr:$src2), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufpd VR256:$src1,
                              (memopv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4f64 (X86Shufpd VR256:$src1,
                              (memopv4f64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))], d>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))], d>;
}
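
// Illustrative sketch (standard unpck semantics, for reference only):
//   unpcklps: dst = { src1[0], src2[0], src1[1], src2[1] }
//   unpckhps: dst = { src1[2], src2[2], src1[3], src2[3] }
//   unpcklpd: dst = { src1[0], src2[0] }   unpckhpd: dst = { src1[1], src2[1] }
// The 256-bit AVX forms below perform the same interleave independently
// within each 128-bit lane.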

let AddedComplexity = 10 in {
  defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedSingle>, TB, VEX_4V;
  defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedDouble>, TB, OpSize, VEX_4V;
  defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedSingle>, TB, VEX_4V;
  defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedDouble>, TB, OpSize, VEX_4V;

  defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
        VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedSingle>, TB, VEX_4V;
  defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
        VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedDouble>, TB, OpSize, VEX_4V;
  defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
        VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedSingle>, TB, VEX_4V;
  defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
        VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SSEPackedDouble>, TB, OpSize, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
          VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                         SSEPackedSingle>, TB;
    defm UNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
          VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                         SSEPackedDouble>, TB, OpSize;
    defm UNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
          VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                         SSEPackedSingle>, TB;
    defm UNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
          VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                         SSEPackedDouble>, TB, OpSize;
  } // Constraints = "$src1 = $dst"
} // AddedComplexity

let Predicates = [HasSSE1] in {
  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
            (UNPCKLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
            (UNPCKLPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
            (UNPCKHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
            (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [HasSSE2] in {
  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
            (UNPCKLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
            (UNPCKLPDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
            (UNPCKHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
            (UNPCKHPDrr VR128:$src1, VR128:$src2)>;

  // FIXME: Instead of X86Movddup, there should be an X86Unpcklpd here; the
  // problem is during lowering, where it's not possible to recognize the load
  // fold because it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (UNPCKLPDrr VR128:$src, VR128:$src)>;

  let AddedComplexity = 10 in
  def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
            (UNPCKLPDrr VR128:$src, VR128:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
            (VUNPCKLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
            (VUNPCKLPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
            (VUNPCKHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
            (VUNPCKHPSrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, (memopv8i32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, (memopv8i32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
            (VUNPCKLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
            (VUNPCKLPDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
            (VUNPCKHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
            (VUNPCKHPDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, (memopv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;

  // FIXME: Instead of X86Movddup, there should be an X86Unpcklpd here; the
  // problem is during lowering, where it's not possible to recognize the load
  // fold because it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
  let AddedComplexity = 10 in
  def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
                !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                     [(set GR32:$dst, (Int RC:$src))], d>;
  def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
                !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, REX_W;
}
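
// Illustrative note: movmskps packs the sign bit of each f32 element into
// bits 3:0 of the destination GPR (movmskpd uses bits 1:0) and zeroes the
// remaining bits, so e.g. { -1.0, 2.0, -3.0, 4.0 } produces 0b0101 = 5.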

defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, TB;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, TB, OpSize;

def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
                                       sub_ss))>, Requires<[HasSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
                                       sub_ss))>, Requires<[HasSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
                                       sub_sd))>, Requires<[HasSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
                                       sub_sd))>, Requires<[HasSSE2]>;

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, TB, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, TB,
                                        OpSize, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, TB, VEX;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, TB,
                                        OpSize, VEX;

  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
                                          sub_ss))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
                                          sub_ss))>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
                                          sub_sd))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
                                          sub_sd))>;

  // Assembler Only
  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
             OpSize, VEX;
  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
             OpSize, VEX;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
///
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                       SDNode OpNode> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, TB, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, TB, OpSize, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32, SSEPackedSingle>, TB;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64, SSEPackedDouble>, TB, OpSize;
  }
}
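
// Illustrative expansion (an editorial sketch, not part of the build): with
// NAME bound by "defm FsAND" below, the V#NAME#PS defm produces VFsANDPSrr
// and VFsANDPSrm, and the constrained PS defm produces FsANDPSrr/FsANDPSrm,
// giving scalar FR32/FR64 bit-ops that reuse the packed logical opcodes.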

// Alias bitwise logical operations using SSE logical ops on packed FP values.
let mayLoad = 0 in {
  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand>;
  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for>;
  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>;
}

let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
  defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef>;

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  // In AVX there is no need to add a pattern for the 128-bit logical rr ps
  // form, because those operations are all promoted to v2i64 and the patterns
  // are covered by the int version. The pattern is needed only for SSE,
  // because v2i64 is supported on SSE2 but not on SSE1.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (memopv2i64 addr:$src2)))], 0>,
                                                 TB, OpSize, VEX_4V;
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, TB;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, TB, OpSize;
  }
}

/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical op forms
///
multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
                                     SDNode OpNode> {
    defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
          !strconcat(OpcodeStr, "ps"), f256mem,
          [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
          [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                                    (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V;

    defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
          !strconcat(OpcodeStr, "pd"), f256mem,
          [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                    (bc_v4i64 (v4f64 VR256:$src2))))],
          [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                    (memopv4i64 addr:$src2)))], 0>,
                                    TB, OpSize, VEX_4V;
}

// AVX 256-bit packed logical op forms
defm VAND  : sse12_fp_packed_logical_y<0x54, "and", and>;
defm VOR   : sse12_fp_packed_logical_y<0x56, "or", or>;
defm VXOR  : sse12_fp_packed_logical_y<0x57, "xor", xor>;
defm VANDN : sse12_fp_packed_logical_y<0x55, "andn", X86andnp>;

defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
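
// Illustrative note: X86andnp matches the andnps/andnpd behavior,
// dst = ~src1 & src2; the first operand is complemented before the AND,
// which is why the ANDN forms are marked not commutable above.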

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  bit Is2Addr = 1> {
  defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                            OpNode, FR32, f32mem, Is2Addr>, XS;
  defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                            OpNode, FR64, f64mem, Is2Addr>, XD;
}
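
// Illustrative expansion (an editorial sketch, not part of the build):
// "defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>" instantiates the
// rr/rm defs of sse12_fp_scalar twice, producing ADDSSrr, ADDSSrm, ADDSDrr
// and ADDSDrm, selected from (fadd FR32:$src1, FR32:$src2) and the f64
// equivalents.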

multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   bit Is2Addr = 1> {
  let mayLoad = 0 in {
  defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
              v4f32, f128mem, memopv4f32, SSEPackedSingle, Is2Addr>, TB;
  defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
              v2f64, f128mem, memopv2f64, SSEPackedDouble, Is2Addr>, TB, OpSize;
  }
}

multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
                                    SDNode OpNode> {
  let mayLoad = 0 in {
    defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
                v8f32, f256mem, memopv8f32, SSEPackedSingle, 0>, TB;
    defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
                v4f64, f256mem, memopv4f64, SSEPackedDouble, 0>, TB, OpSize;
  }
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      bit Is2Addr = 1> {
  defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, Is2Addr>, XS;
  defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, Is2Addr>, XD;
}

multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr,
                                      bit Is2Addr = 1> {
  defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32,
                                              SSEPackedSingle, Is2Addr>, TB;

  defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128,
     !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64,
                                      SSEPackedDouble, Is2Addr>, TB, OpSize;
}

multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> {
  defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
     !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
      SSEPackedSingle, 0>, TB;

  defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
     !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
      SSEPackedDouble, 0>, TB, OpSize;
}

// Binary Arithmetic instructions
defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>,
            basic_sse12_fp_binop_s_int<0x58, "add", 0>, VEX_4V, VEX_LIG;
defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
            basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V;
defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>,
            basic_sse12_fp_binop_s_int<0x59, "mul", 0>, VEX_4V, VEX_LIG;
defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
            basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V;

let isCommutable = 0 in {
  defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>,
              basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, VEX_4V, VEX_LIG;
  defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
              basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V;
  defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>,
              basic_sse12_fp_binop_s_int<0x5E, "div", 0>, VEX_4V, VEX_LIG;
  defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
              basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V;
  defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>,
              basic_sse12_fp_binop_s_int<0x5F, "max", 0>, VEX_4V, VEX_LIG;
  defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
              basic_sse12_fp_binop_p_int<0x5F, "max", 0>,
              basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>,
              basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V;
  defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>,
              basic_sse12_fp_binop_s_int<0x5D, "min", 0>, VEX_4V, VEX_LIG;
  defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
              basic_sse12_fp_binop_p_int<0x5D, "min", 0>,
              basic_sse12_fp_binop_p_y_int<0x5D, "min">,
              basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>,
             basic_sse12_fp_binop_p<0x58, "add", fadd>,
             basic_sse12_fp_binop_s_int<0x58, "add">;
  defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul>,
             basic_sse12_fp_binop_p<0x59, "mul", fmul>,
             basic_sse12_fp_binop_s_int<0x59, "mul">;

  let isCommutable = 0 in {
    defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub>,
               basic_sse12_fp_binop_p<0x5C, "sub", fsub>,
               basic_sse12_fp_binop_s_int<0x5C, "sub">;
    defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv>,
               basic_sse12_fp_binop_p<0x5E, "div", fdiv>,
               basic_sse12_fp_binop_s_int<0x5E, "div">;
    defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax>,
               basic_sse12_fp_binop_p<0x5F, "max", X86fmax>,
               basic_sse12_fp_binop_s_int<0x5F, "max">,
               basic_sse12_fp_binop_p_int<0x5F, "max">;
    defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin>,
               basic_sse12_fp_binop_p<0x5D, "min", X86fmin>,
               basic_sse12_fp_binop_s_int<0x5D, "min">,
               basic_sse12_fp_binop_p_int<0x5D, "min">;
  }
}

/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And we have a special variant for the full-vector intrinsic form.

/// sse1_fp_unop_s - SSE1 unops in scalar form.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F32Int> {
  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))]>;
  // For scalar unary operations, fold a load into the operation only in
  // OptForSize mode. Folding eliminates an instruction, but it also
  // eliminates a whole-register clobber (the load), so it introduces a
  // partial register update condition (a false dependency on the previous
  // contents of the destination register).
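  // For example (illustrative only): the folded form "sqrtss (%rax), %xmm0"
  // writes just bits 31:0 of %xmm0 and so depends on the register's previous
  // value, while "movss (%rax), %xmm0" followed by "sqrtss %xmm0, %xmm0"
  // redefines the full register first and carries no false dependency.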
  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
            Requires<[HasSSE1, OptForSize]>;
  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int VR128:$src))]>;
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
}

/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
                !strconcat(OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  let mayLoad = 1 in
  def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
                !strconcat(OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                (ins ssmem:$src1, VR128:$src2),
                !strconcat(OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
}

/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
}

/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form.
multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>;
  def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))]>;
}

/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V4F32Int> {
  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))]>;
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
}

/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms.
multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
                                Intrinsic V4F32Int> {
  def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V4F32Int VR256:$src))]>;
  def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))]>;
}

/// sse2_fp_unop_s - SSE2 unops in scalar form.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F64Int> {
  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode FR64:$src))]>;
  // See the comments in sse1_fp_unop_s for why this is OptForSize.
  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode (load addr:$src)))]>, XD,
            Requires<[HasSSE2, OptForSize]>;
  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int VR128:$src))]>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
}

/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
               !strconcat(OpcodeStr,
                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
               !strconcat(OpcodeStr,
                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, sdmem:$src2),
               !strconcat(OpcodeStr,
                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode> {
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
}

/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms.
multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>;
  def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))]>;
}

/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V2F64Int> {
  def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V2F64Int VR128:$src))]>;
  def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
}

/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms.
multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
                                Intrinsic V2F64Int> {
  def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V2F64Int VR256:$src))]>;
  def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>;
}

let Predicates = [HasAVX] in {
  // Square root.
  defm VSQRT  : sse1_fp_unop_s_avx<0x51, "vsqrt">,
                sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG;

  defm VSQRT  : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>,
                sse2_fp_unop_p<0x51, "vsqrt", fsqrt>,
                sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>,
                sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>,
                sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps>,
                sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd>,
                sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256>,
                sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256>,
                VEX;

  // Reciprocal approximations. Note that these typically require refinement
  // in order to obtain suitable precision.
  defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG;
  defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>,
                sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>,
                sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>,
                sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX;

  defm VRCP   : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG;
  defm VRCP   : sse1_fp_unop_p<0x53, "vrcp", X86frcp>,
                sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>,
                sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>,
                sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX;
}

def : Pat<(f32 (fsqrt FR32:$src)),
          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (fsqrt (load addr:$src))),
          (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;
def : Pat<(f64 (fsqrt FR64:$src)),
          (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
def : Pat<(f64 (fsqrt (load addr:$src))),
          (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;

def : Pat<(f32 (X86frsqrt FR32:$src)),
          (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (X86frsqrt (load addr:$src))),
          (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;

def : Pat<(f32 (X86frcp FR32:$src)),
          (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (X86frcp (load addr:$src))),
          (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX, OptForSize]>;

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
                (VSQRTSSr (f32 (IMPLICIT_DEF)),
                          (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
                sub_ss)>;
  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
            (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
                (VSQRTSDr (f64 (IMPLICIT_DEF)),
                          (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd)),
                sub_sd)>;
  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;

  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
                (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                          (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
                sub_ss)>;
  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
                (VRCPSSr (f32 (IMPLICIT_DEF)),
                         (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
                sub_ss)>;
  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}

// Square root.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss>,
             sse1_fp_unop_p<0x51, "sqrt",  fsqrt>,
             sse1_fp_unop_p_int<0x51, "sqrt",  int_x86_sse_sqrt_ps>,
             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd>,
             sse2_fp_unop_p<0x51, "sqrt",  fsqrt>,
             sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>,
             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp>,
             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>;

// There is no f64 version of the reciprocal approximation instructions.
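
// Illustrative refinement step (standard Newton-Raphson, stated for
// reference only): given x0 = rcp(a), one iteration x1 = x0 * (2 - a * x0)
// roughly doubles the number of correct bits; for rsqrt, use
// x1 = 0.5 * x0 * (3 - a * x0 * x0). The hardware estimates provide about
// 12 bits of precision.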

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Non-temporal stores
//===----------------------------------------------------------------------===//

let AddedComplexity = 400 in { // Prefer non-temporal versions
  def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4f32 VR128:$src),
                                                 addr:$dst)]>, VEX;
  def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movntpd\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v2f64 VR128:$src),
                                                 addr:$dst)]>, VEX;
  def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs),
                        (ins f128mem:$dst, VR128:$src),
                        "movntdq\t{$src, $dst|$dst, $src}",
                        [(alignednontemporalstore (v2f64 VR128:$src),
                                                  addr:$dst)]>, VEX;

  let ExeDomain = SSEPackedInt in
  def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                           (ins f128mem:$dst, VR128:$src),
                           "movntdq\t{$src, $dst|$dst, $src}",
                           [(alignednontemporalstore (v4f32 VR128:$src),
                                                     addr:$dst)]>, VEX;

  def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;

  def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v8f32 VR256:$src),
                                                 addr:$dst)]>, VEX;
  def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntpd\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4f64 VR256:$src),
                                                 addr:$dst)]>, VEX;
  def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movntdq\t{$src, $dst|$dst, $src}",
                        [(alignednontemporalstore (v4f64 VR256:$src),
                                                  addr:$dst)]>, VEX;
  let ExeDomain = SSEPackedInt in
  def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                      (ins f256mem:$dst, VR256:$src),
                      "movntdq\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v8f32 VR256:$src),
                                                addr:$dst)]>, VEX;
}

def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src),
          (VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
          (VMOVNTPDYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
          (VMOVNTPSYmr addr:$dst, VR256:$src)>;

let AddedComplexity = 400 in { // Prefer non-temporal versions
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;

def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;

def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;

// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
               TB, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                  TB, Requires<[HasSSE2]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic.
def PREFETCHT0   : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>;
def PREFETCHT1   : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>;
def PREFETCHT2   : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>;
def PREFETCHNTA  : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>;
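//
// The third pattern operand above is the locality hint. An illustrative C
// mapping (assuming <xmmintrin.h> and the usual hint constants):
//   _mm_prefetch(p, _MM_HINT_T0);   // prefetcht0, locality 3
//   _mm_prefetch(p, _MM_HINT_NTA);  // prefetchnta, locality 0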

// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
              TB, Requires<[HasSSE2]>;

// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
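//
// Typical use is inside a spin-wait loop, where it reduces power and avoids
// memory-order mis-speculation penalties; e.g. (illustrative):
//   while (!flag) _mm_pause();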

// Load, store, and memory fence
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
               "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;

def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;
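//
// sfence orders stores, lfence orders loads, and mfence orders both. A
// common pairing with the non-temporal stores above (a sketch):
//   movntps %xmm0, (%rdi)
//   sfence                  # make the streaming store globally visible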

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX;

def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
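//
// MXCSR holds the SSE rounding-mode, exception-mask and status bits.
// Illustrative C usage (assuming <xmmintrin.h>):
//   unsigned csr = _mm_getcsr();    // stmxcsr
//   _mm_setcsr(csr | 0x8040);       // ldmxcsr; sets the FTZ and DAZ bits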

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let neverHasSideEffects = 1 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
}
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;

// For Disassembler
let isCodeGenOnly = 1 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
}

let canFoldAsLoad = 1, mayLoad = 1 in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
}
}

let mayStore = 1 in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i256mem:$dst, VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
}
}

let neverHasSideEffects = 1 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   []>, XS, Requires<[HasSSE2]>;

// For Disassembler
let isCodeGenOnly = 1 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       []>, XS, Requires<[HasSSE2]>;
}

let canFoldAsLoad = 1, mayLoad = 1 in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                 XS, Requires<[HasSSE2]>;
}

let mayStore = 1 in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                 XS, Requires<[HasSSE2]>;
}

// Intrinsic forms of MOVDQU load and store
def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                       "vmovdqu\t{$src, $dst|$dst, $src}",
                       [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
                     XS, VEX, Requires<[HasAVX]>;

def MOVDQUmr_Int :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
                     XS, Requires<[HasSSE2]>;

} // ExeDomain = SSEPackedInt

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>;
  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
            (VMOVDQUYmr addr:$dst, VR256:$src)>;
}
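//
// movdqa faults (#GP) if its memory operand is not 16-byte aligned, while
// movdqu accepts any alignment; illustratively, in C (assuming
// <emmintrin.h>):
//   __m128i a = _mm_load_si128(p);    // movdqa: p must be 16-byte aligned
//   __m128i u = _mm_loadu_si128(p);   // movdqu: any alignment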

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId VR128:$src1,
                                (bitconvert (memopv2i64 addr:$src2))))]>;
}

multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, Intrinsic IntId,
                             Intrinsic IntId2, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId VR128:$src1,
                                      (bitconvert (memopv2i64 addr:$src2))))]>;
  def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst),
       (ins VR128:$src1, i32i8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
}

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>;
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
                                     (bitconvert (memopv2i64 addr:$src2)))))]>;
}

/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
///
/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
/// to collapse (bitconvert VT to VT) into its operand.
///
multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
                              bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]>;
  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (OpNode VR128:$src1, (memopv2i64 addr:$src2)))]>;
}

} // ExeDomain = SSEPackedInt

// 128-bit Integer Arithmetic

let Predicates = [HasAVX] in {
defm VPADDB  : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V;
defm VPADDW  : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V;
defm VPADDD  : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V;
defm VPADDQ  : PDI_binop_rm_v2i64<0xD4, "vpaddq", add, 1, 0>, VEX_4V;
defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, 1, 0>, VEX_4V;
defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, 0, 0>, VEX_4V;
defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, 0, 0>, VEX_4V;
defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, 0, 0>, VEX_4V;
defm VPSUBQ : PDI_binop_rm_v2i64<0xFB, "vpsubq", sub, 0, 0>, VEX_4V;

// Intrinsic forms
defm VPSUBSB  : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, 0, 0>,
                                 VEX_4V;
defm VPSUBSW  : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, 0, 0>,
                                 VEX_4V;
defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, 0, 0>,
                                 VEX_4V;
defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, 0, 0>,
                                 VEX_4V;
defm VPADDSB  : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, 1, 0>,
                                 VEX_4V;
defm VPADDSW  : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w, 1, 0>,
                                 VEX_4V;
defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b, 1, 0>,
                                 VEX_4V;
defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w, 1, 0>,
                                 VEX_4V;
defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, 1, 0>,
                                 VEX_4V;
defm VPMULHW  : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, 1, 0>,
                                 VEX_4V;
defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq, 1, 0>,
                                 VEX_4V;
defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, 1, 0>,
                                 VEX_4V;
defm VPAVGB   : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, 1, 0>,
                                 VEX_4V;
defm VPAVGW   : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w, 1, 0>,
                                 VEX_4V;
defm VPMINUB  : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b, 1, 0>,
                                 VEX_4V;
defm VPMINSW  : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w, 1, 0>,
                                 VEX_4V;
defm VPMAXUB  : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b, 1, 0>,
                                 VEX_4V;
defm VPMAXSW  : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w, 1, 0>,
                                 VEX_4V;
defm VPSADBW  : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, 1, 0>,
                                 VEX_4V;
}

let Constraints = "$src1 = $dst" in {
defm PADDB  : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
defm PADDW  : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
defm PADDD  : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
defm PADDQ  : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;

// Intrinsic forms
defm PSUBSB  : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
defm PSUBSW  : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
defm PADDSB  : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
defm PADDSW  : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
defm PMULHW  : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, 1>;
defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
defm PAVGB   : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
defm PAVGW   : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
defm PMINUB  : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
defm PMINSW  : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
defm PMAXUB  : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
defm PMAXSW  : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
defm PSADBW  : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;

} // Constraints = "$src1 = $dst"

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw",
                                int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>,
                                VEX_4V;
defm VPSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "vpslld",
                                int_x86_sse2_psll_d, int_x86_sse2_pslli_d, 0>,
                                VEX_4V;
defm VPSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "vpsllq",
                                int_x86_sse2_psll_q, int_x86_sse2_pslli_q, 0>,
                                VEX_4V;

defm VPSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "vpsrlw",
                                int_x86_sse2_psrl_w, int_x86_sse2_psrli_w, 0>,
                                VEX_4V;
defm VPSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "vpsrld",
                                int_x86_sse2_psrl_d, int_x86_sse2_psrli_d, 0>,
                                VEX_4V;
defm VPSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "vpsrlq",
                                int_x86_sse2_psrl_q, int_x86_sse2_psrli_q, 0>,
                                VEX_4V;

defm VPSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "vpsraw",
                                int_x86_sse2_psra_w, int_x86_sse2_psrai_w, 0>,
                                VEX_4V;
defm VPSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad",
                                int_x86_sse2_psra_d, int_x86_sse2_psrai_d, 0>,
                                VEX_4V;

defm VPAND : PDI_binop_rm_v2i64<0xDB, "vpand", and, 1, 0>, VEX_4V;
defm VPOR  : PDI_binop_rm_v2i64<0xEB, "vpor" , or, 1, 0>, VEX_4V;
defm VPXOR : PDI_binop_rm_v2i64<0xEF, "vpxor", xor, 1, 0>, VEX_4V;

let ExeDomain = SSEPackedInt in {
  let neverHasSideEffects = 1 in {
    // 128-bit logical shifts.
    def VPSLLDQri : PDIi8<0x73, MRM7r,
                      (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                      "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      VEX_4V;
    def VPSRLDQri : PDIi8<0x73, MRM3r,
                      (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                      "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      VEX_4V;
    // PSRADQri doesn't exist in SSE[1-3].
  }
  def VPANDNrr : PDI<0xDF, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                          (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V;

  def VPANDNrm : PDI<0xDF, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                    "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst, (X86andnp VR128:$src1,
                                            (memopv2i64 addr:$src2)))]>, VEX_4V;
}
}

let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
                               int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
                               int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
                               int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;

defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
                               int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
                               int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
                               int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;

defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
                               int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                               int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;

defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
defm POR  : PDI_binop_rm_v2i64<0xEB, "por" , or, 1>;
defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;

let ExeDomain = SSEPackedInt in {
  let neverHasSideEffects = 1 in {
    // 128-bit logical shifts.
    def PSLLDQri : PDIi8<0x73, MRM7r,
                         (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                         "pslldq\t{$src2, $dst|$dst, $src2}", []>;
    def PSRLDQri : PDIi8<0x73, MRM3r,
                         (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                         "psrldq\t{$src2, $dst|$dst, $src2}", []>;
    // PSRADQri doesn't exist in SSE[1-3].
  }
  def PANDNrr : PDI<0xDF, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "pandn\t{$src2, $dst|$dst, $src2}", []>;

  def PANDNrm : PDI<0xDF, MRMSrcMem,
                    (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                    "pandn\t{$src2, $dst|$dst, $src2}", []>;
}
} // Constraints = "$src1 = $dst"

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (v2i64 (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (v2i64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
  def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
            (v2i64 (VPSLLDQri VR128:$src1, imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
            (v2i64 (VPSRLDQri VR128:$src1, imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (v2f64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;

  // Shift up / down and insert zeros.
  def : Pat<(v2i64 (X86vshl  VR128:$src, (i8 imm:$amt))),
            (v2i64 (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
  def : Pat<(v2i64 (X86vshr  VR128:$src, (i8 imm:$amt))),
            (v2i64 (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
}

let Predicates = [HasSSE2] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
  def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
            (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
            (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;

  // Shift up / down and insert zeros.
  def : Pat<(v2i64 (X86vshl  VR128:$src, (i8 imm:$amt))),
            (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
  def : Pat<(v2i64 (X86vshr  VR128:$src, (i8 imm:$amt))),
            (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
}
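//
// Note that pslldq / psrldq shift the full 128-bit register by *bytes*, not
// bits; BYTE_imm above rescales a bit count to bytes. E.g., illustratively:
//   pslldq $4, %xmm0   # shift xmm0 left by 4 bytes, filling with zeros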

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
  defm VPCMPEQB  : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1,
                                    0>, VEX_4V;
  defm VPCMPEQW  : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1,
                                    0>, VEX_4V;
  defm VPCMPEQD  : PDI_binop_rm_int<0x76, "vpcmpeqd", int_x86_sse2_pcmpeq_d, 1,
                                    0>, VEX_4V;
  defm VPCMPGTB  : PDI_binop_rm_int<0x64, "vpcmpgtb", int_x86_sse2_pcmpgt_b, 0,
                                    0>, VEX_4V;
  defm VPCMPGTW  : PDI_binop_rm_int<0x65, "vpcmpgtw", int_x86_sse2_pcmpgt_w, 0,
                                    0>, VEX_4V;
  defm VPCMPGTD  : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d, 0,
                                    0>, VEX_4V;

  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
            (VPCMPEQBrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
            (VPCMPEQBrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
            (VPCMPEQWrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
            (VPCMPEQWrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
            (VPCMPEQDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
            (VPCMPEQDrm VR128:$src1, addr:$src2)>;

  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
            (VPCMPGTBrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
            (VPCMPGTBrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
            (VPCMPGTWrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
            (VPCMPGTWrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
            (VPCMPGTDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
            (VPCMPGTDrm VR128:$src1, addr:$src2)>;
}

let Constraints = "$src1 = $dst" in {
  defm PCMPEQB  : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b, 1>;
  defm PCMPEQW  : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w, 1>;
  defm PCMPEQD  : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d, 1>;
  defm PCMPGTB  : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
  defm PCMPGTW  : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
  defm PCMPGTD  : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
} // Constraints = "$src1 = $dst"

let Predicates = [HasSSE2] in {
  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
            (PCMPEQBrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
            (PCMPEQBrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
            (PCMPEQWrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
            (PCMPEQWrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
            (PCMPEQDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
            (PCMPEQDrm VR128:$src1, addr:$src2)>;

  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
            (PCMPGTBrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
            (PCMPGTBrm VR128:$src1, addr:$src2)>;
  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
            (PCMPGTWrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
            (PCMPGTWrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
            (PCMPGTDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
            (PCMPGTDrm VR128:$src1, addr:$src2)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Pack Instructions
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128,
                                  0, 0>, VEX_4V;
defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128,
                                  0, 0>, VEX_4V;
defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128,
                                  0, 0>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
} // Constraints = "$src1 = $dst"
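//
// The pack instructions narrow two source vectors into one with saturation;
// e.g. packsswb turns 8 + 8 signed words into 16 signed bytes, clamping
// each word to [-128, 127] (packuswb clamps to [0, 255] instead).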

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, PatFrag pshuf_frag,
                         PatFrag bc_frag> {
def ri : Ii8<0x70, MRMSrcReg,
              (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
              !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst, (vt (pshuf_frag:$src2 VR128:$src1,
                                                      (undef))))]>;
def mi : Ii8<0x70, MRMSrcMem,
              (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
              !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst, (vt (pshuf_frag:$src2
                                      (bc_frag (memopv2i64 addr:$src1)),
                                      (undef))))]>;
}
} // ExeDomain = SSEPackedInt

let Predicates = [HasAVX] in {
  let AddedComplexity = 5 in
  defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize,
                               VEX;

  // SSE2 with ImmT == Imm8 and XS prefix.
  defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, pshufhw, bc_v8i16>, XS,
                               VEX;

  // SSE2 with ImmT == Imm8 and XD prefix.
  defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD,
                               VEX;

  let AddedComplexity = 5 in
  def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
            (VPSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
  // Unary v4f32 shuffle with VPSHUF* in order to fold a load.
  def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
            (VPSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;

  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
                                   (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
                                   (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
  def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
  def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
            (VPSHUFHWri VR128:$src, imm:$imm)>;
  def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
                               (i8 imm:$imm))),
            (VPSHUFHWmi addr:$src, imm:$imm)>;
  def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
            (VPSHUFLWri VR128:$src, imm:$imm)>;
  def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
                               (i8 imm:$imm))),
            (VPSHUFLWmi addr:$src, imm:$imm)>;
}

let Predicates = [HasSSE2] in {
  let AddedComplexity = 5 in
  defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize;

  // SSE2 with ImmT == Imm8 and XS prefix.
  defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, pshufhw, bc_v8i16>, XS;

  // SSE2 with ImmT == Imm8 and XD prefix.
  defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD;

  let AddedComplexity = 5 in
  def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
            (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
  // Unary v4f32 shuffle with PSHUF* in order to fold a load.
  def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
            (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;

  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
                                   (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
                                   (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
  def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
  def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
            (PSHUFHWri VR128:$src, imm:$imm)>;
  def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
                               (i8 imm:$imm))),
            (PSHUFHWmi addr:$src, imm:$imm)>;
  def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
            (PSHUFLWri VR128:$src, imm:$imm)>;
  def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
                               (i8 imm:$imm))),
            (PSHUFLWmi addr:$src, imm:$imm)>;
}
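//
// The imm8 operand encodes four 2-bit source-element selectors, lowest
// destination element first; e.g. pshufd with 0x1B (0b00011011) selects
// elements 3,2,1,0 and thus reverses the four dwords.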

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_frag (memopv2i64
                                               addr:$src2))))]>;
}

let Predicates = [HasAVX] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq,
                                 bc_v4i32, 0>, VEX_4V;

  /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
  /// knew to collapse (bitconvert VT to VT) into its operand.
  def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
            (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
            "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1,
                                                    VR128:$src2)))]>, VEX_4V;
  def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
            (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
            "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1,
                                        (memopv2i64 addr:$src2))))]>, VEX_4V;

  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq,
                                 bc_v4i32, 0>, VEX_4V;

  /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
  /// knew to collapse (bitconvert VT to VT) into its operand.
  def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
             (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
             "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1,
                                                     VR128:$src2)))]>, VEX_4V;
  def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
             (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
             "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1,
                                        (memopv2i64 addr:$src2))))]>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, bc_v4i32>;

  /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
  /// knew to collapse (bitconvert VT to VT) into its operand.
  def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "punpcklqdq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)))]>;
  def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
                         (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                         "punpcklqdq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v2i64 (X86Punpcklqdq VR128:$src1,
                                         (memopv2i64 addr:$src2))))]>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, bc_v16i8>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, bc_v8i16>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, bc_v4i32>;

  /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
  /// knew to collapse (bitconvert VT to VT) into its operand.
  def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                         "punpckhqdq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)))]>;
  def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                        "punpckhqdq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (v2i64 (X86Punpckhqdq VR128:$src1,
                                         (memopv2i64 addr:$src2))))]>;
}
} // ExeDomain = SSEPackedInt

// Splat v2f64 / v2i64
let AddedComplexity = 10 in {
  def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
            (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
  def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
            (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>;
}
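//
// punpcklqdq with the same register for both operands duplicates the low
// 64-bit element into both halves ({a0,a1} unpacked with itself gives
// {a0,a0}), which is why it implements the splat here.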

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))]>;
}

// Extract
let Predicates = [HasAVX] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                imm:$src2))]>, TB, OpSize, VEX;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                imm:$src2))]>;

// Insert
let Predicates = [HasAVX] in {
  defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
  def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
       "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
       []>, TB, OpSize, VEX_4V;
}

let Constraints = "$src1 = $dst" in
  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX;
def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;

} // ExeDomain = SSEPackedInt
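//
// pmovmskb gathers the most significant bit of each byte into the low 16
// bits of a GPR. A common idiom tests a byte-compare result; illustratively
// in C (assuming <emmintrin.h>):
//   int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)); // pcmpeqb+pmovmskb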

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

let Uses = [EDI] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, VEX;
let Uses = [RDI] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX;

let Uses = [EDI] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;

} // ExeDomain = SSEPackedInt
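//
// maskmovdqu stores only the bytes of $src whose corresponding byte in
// $mask has its high bit set, to the address implicitly held in EDI/RDI
// (hence the Uses lists above); the store carries a non-temporal hint.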

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>, VEX;
def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      VEX;
def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;

def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>;
def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>;
def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>;

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX;

def VMOVDI2SSrm  : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
                      VEX;
def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert GR32:$src))]>;

def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))]>, VEX;
def VMOVPDI2DImr  : VPDI<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)]>, VEX;
def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))]>;
def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)]>;
4149
4150//===---------------------------------------------------------------------===//
4151// Move Packed Doubleword Int first element to Doubleword Int
4152//
4153def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4154                          "mov{d|q}\t{$src, $dst|$dst, $src}",
4155                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
4156                                                           (iPTR 0)))]>,
4157                      TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;
4158
4159def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4160                        "mov{d|q}\t{$src, $dst|$dst, $src}",
4161                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
4162                                                         (iPTR 0)))]>;
4163
4164//===---------------------------------------------------------------------===//
4165// Bitcast FR64 <-> GR64
4166//
4167let Predicates = [HasAVX] in
4168def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4169                        "vmovq\t{$src, $dst|$dst, $src}",
4170                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
4171                        VEX;
4172def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4173                         "mov{d|q}\t{$src, $dst|$dst, $src}",
4174                         [(set GR64:$dst, (bitconvert FR64:$src))]>;
4175def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4176                         "movq\t{$src, $dst|$dst, $src}",
4177                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
4178
4179def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4180                       "movq\t{$src, $dst|$dst, $src}",
4181                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
4182def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4183                       "mov{d|q}\t{$src, $dst|$dst, $src}",
4184                       [(set GR64:$dst, (bitconvert FR64:$src))]>;
4185def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4186                       "movq\t{$src, $dst|$dst, $src}",
4187                       [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
4188
4189//===---------------------------------------------------------------------===//
4190// Move Scalar Single to Double Int
4191//
4192def VMOVSS2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4193                      "movd\t{$src, $dst|$dst, $src}",
4194                      [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX;
4195def VMOVSS2DImr  : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4196                      "movd\t{$src, $dst|$dst, $src}",
4197                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX;
4198def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4199                      "movd\t{$src, $dst|$dst, $src}",
4200                      [(set GR32:$dst, (bitconvert FR32:$src))]>;
4201def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4202                      "movd\t{$src, $dst|$dst, $src}",
4203                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
4204
//===---------------------------------------------------------------------===//
// Patterns and instructions for zero-extending movd/movq moves to an XMM
// register
//
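// Illustrative note (sketch): X86vzmovl models "insert the scalar into
// element 0 and zero the remaining elements", so e.g.
//   (v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src))))
// describes "movd %eax, %xmm0", which sets xmm0 = {eax, 0, 0, 0}.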
let AddedComplexity = 15 in {
def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86vzmovl
                                      (v4i32 (scalar_to_vector GR32:$src)))))]>,
                                      VEX;
def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))]>,
                                      VEX, VEX_W;
}
let AddedComplexity = 15 in {
def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86vzmovl
                                      (v4i32 (scalar_to_vector GR32:$src)))))]>;
def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))]>;
}

let AddedComplexity = 20 in {
def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                   (loadi32 addr:$src))))))]>,
                                                   VEX;
def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                   (loadi32 addr:$src))))))]>;
}

let Predicates = [HasSSE2], AddedComplexity = 20 in {
  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
            (MOVZDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
            (MOVZDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
            (MOVZDI2PDIrm addr:$src)>;
}

let Predicates = [HasAVX] in {
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
              (VMOVZDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVZDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVZDI2PDIrm addr:$src)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                                (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                                (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}

// These are the correct encodings of the instructions, so the assembler can
// parse correctly written assembly, even though we continue to emit the wrong
// encodings for compatibility with Darwin's buggy assembler.
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
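// Illustrative example (sketch): with these aliases, "movq %rax, %xmm0" is
// accepted by the parser and maps to MOV64toPQIrr (or the corresponding
// variants above), while the AT&T printed form remains "movd" for Darwin
// compatibility.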

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[HasAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>, VEX;
def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;

//===---------------------------------------------------------------------===//
// Store / copy the lower 64 bits of an XMM register.
//
def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;

let AddedComplexity = 20 in
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))]>,
                     XS, VEX, Requires<[HasAVX]>;

let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))]>,
                     XS, Requires<[HasSSE2]>;

let Predicates = [HasSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

let Predicates = [HasAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// Move from XMM to XMM, clearing the upper 64 bits. Note: there is a bug in
// the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
//
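// Illustrative semantics (sketch): "movq %xmm1, %xmm0" copies the low
// quadword of xmm1 into xmm0 and zeroes xmm0[127:64]; the X86vzmovl node
// below models exactly that zeroing copy.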
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      XS, Requires<[HasSSE2]>;

let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))]>,
                      XS, VEX, Requires<[HasAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))]>,
                      XS, Requires<[HasSSE2]>;
}

let AddedComplexity = 20 in {
  let Predicates = [HasSSE2] in {
    def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
              (MOVZPQILo2PQIrm addr:$src)>;
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [HasAVX] in {
    def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
              (VMOVZPQILo2PQIrm addr:$src)>;
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
}

// Instructions to match in the assembler
def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
// Recognize "movd" with GR64 destination, but encode as a "movq"
def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;

// Instructions for the disassembler
// xr = XMM register
// xm = mem64

let Predicates = [HasAVX] in
def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 "movq\t{$src, $dst|$dst, $src}", []>, XS;

//===---------------------------------------------------------------------===//
// SSE3 - Conversion Instructions
//===---------------------------------------------------------------------===//

// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but it cannot do the same for memory operands. Provide additional
// rr and rm assembly forms to address this explicitly.
def VCVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPD2DQXrYr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;

// XMM only
def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;

// YMM only
def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                      "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                      "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
}
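// Illustrative note (sketch): "vcvtpd2dq %ymm0, %xmm0" is unambiguous for
// register sources, but "vcvtpd2dq (%rax), %xmm0" is not, so the explicit
// "vcvtpd2dqx" (128-bit memory) and "vcvtpd2dqy" (256-bit memory) spellings
// let the programmer state the source width.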

def CVTPD2DQrm  : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
def CVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;

def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
          (VCVTPD2DQYrr VR256:$src)>;
def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
          (VCVTPD2DQYrm addr:$src)>;

// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
def VCVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTDQ2PDYrm  : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTDQ2PDYrr  : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
}

def CVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
def CVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;

// AVX 256-bit register conversion intrinsics
def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src),
           (VCVTDQ2PDYrr VR128:$src)>;
def : Pat<(int_x86_avx_cvtdq2_pd_256 (memopv4i32 addr:$src)),
           (VCVTDQ2PDYrm addr:$src)>;

def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src),
          (VCVTPD2DQYrr VR256:$src)>;
def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)),
          (VCVTPD2DQYrm addr:$src)>;

def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
          (VCVTDQ2PDYrr VR128:$src)>;
def : Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))),
          (VCVTDQ2PDYrm addr:$src)>;

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>;
}
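// Illustrative semantics (sketch): movshdup duplicates the odd-index lanes
// and movsldup the even-index lanes, so for xmm1 = {a0, a1, a2, a3}:
//   movshdup %xmm1, %xmm0   ->   xmm0 = {a1, a1, a3, a3}
//   movsldup %xmm1, %xmm0   ->   xmm0 = {a0, a0, a2, a2}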

let Predicates = [HasAVX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, memopv8f32, f256mem>, VEX;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, memopv8f32, f256mem>, VEX;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;

let Predicates = [HasSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
                                      (undef))))]>;
}
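// Illustrative semantics (sketch): movddup broadcasts the low double, so for
// xmm1 = {d0, d1}:
//   movddup %xmm1, %xmm0   ->   xmm0 = {d0, d0}
// The rm form only needs a 64-bit load, hence the f64mem operand.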

// FIXME: Merge with the class above once there are patterns for the ymm
// version.
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
let Predicates = [HasAVX] in {
  def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      []>;
  def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      []>;
  }
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;

let Predicates = [HasSSE3] in {
  def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
                   (undef)),
            (MOVDDUPrm addr:$src)>;
  let AddedComplexity = 5 in {
  def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
  def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
  def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
            (MOVDDUPrm addr:$src)>;
  }
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
                   (undef)),
            (VMOVDDUPrm addr:$src)>;
  let AddedComplexity = 5 in {
  def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
  def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
  def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
            (VMOVDDUPrm addr:$src)>;
  }
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;

  // 256-bit version
  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4f64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>;
}
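// Illustrative semantics (sketch): addsub alternates subtract/add per lane,
// so "addsubps %xmm1, %xmm0" computes
//   xmm0 = {xmm0[0]-xmm1[0], xmm0[1]+xmm1[1], xmm0[2]-xmm1[2], xmm0[3]+xmm1[3]}
// and addsubpd does the same for the two double lanes.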

let Predicates = [HasAVX],
  ExeDomain = SSEPackedDouble in {
  defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                               f128mem, 0>, TB, XD, VEX_4V;
  defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                               f128mem, 0>, TB, OpSize, VEX_4V;
  defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                               f256mem, 0>, TB, XD, VEX_4V;
  defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                               f256mem, 0>, TB, OpSize, VEX_4V;
}
let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
    ExeDomain = SSEPackedDouble in {
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem>, TB, XD;
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem>, TB, OpSize;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
}
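// Illustrative semantics (sketch): horizontal ops add/subtract adjacent
// lanes within each source, e.g. "haddps %xmm1, %xmm0" with
// xmm0 = {a0, a1, a2, a3} and xmm1 = {b0, b1, b2, b3} computes
//   xmm0 = {a0+a1, a2+a3, b0+b1, b2+b3}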

let Predicates = [HasAVX] in {
  defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                          X86fhadd, 0>, VEX_4V;
  defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                          X86fhadd, 0>, VEX_4V;
  defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                          X86fhsub, 0>, VEX_4V;
  defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                          X86fhsub, 0>, VEX_4V;
  defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                          X86fhadd, 0>, VEX_4V;
  defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                          X86fhadd, 0>, VEX_4V;
  defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                          X86fhsub, 0>, VEX_4V;
  defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                          X86fhsub, 0>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
  defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
  defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
  defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//


/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                            PatFrag mem_frag128, Intrinsic IntId128> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    OpSize;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (mem_frag128 addr:$src))))]>, OpSize;
}
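// Illustrative expansion (sketch): "defm PABSB : SS3I_unop_rm_int<0x1C, ...>"
// below yields PABSBrr128/PABSBrm128, matching "pabsb %xmm1, %xmm0", which
// writes the per-byte absolute value of the source into the destination.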

let Predicates = [HasAVX] in {
  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8,
                                  int_x86_ssse3_pabs_b_128>, VEX;
  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16,
                                  int_x86_ssse3_pabs_w_128>, VEX;
  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd", memopv4i32,
                                  int_x86_ssse3_pabs_d_128>, VEX;
}

defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8,
                              int_x86_ssse3_pabs_b_128>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16,
                              int_x86_ssse3_pabs_w_128>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32,
                              int_x86_ssse3_pabs_d_128>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             PatFrag mem_frag128, Intrinsic IntId128,
                             bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       OpSize;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}
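// Illustrative semantics (sketch): of the ops below, pshufb is the most
// involved; each destination byte i is selected by the low nibble of mask
// byte i, or zeroed when the mask byte's high bit is set:
//   dst[i] = mask[i] & 0x80 ? 0 : src[mask[i] & 0x0f]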

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16,
                                      int_x86_ssse3_phadd_w_128, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm_int<0x02, "vphaddd", memopv4i32,
                                      int_x86_ssse3_phadd_d_128, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw", memopv8i16,
                                      int_x86_ssse3_phadd_sw_128, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm_int<0x05, "vphsubw", memopv8i16,
                                      int_x86_ssse3_phsub_w_128, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm_int<0x06, "vphsubd", memopv4i32,
                                      int_x86_ssse3_phsub_d_128, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw", memopv8i16,
                                      int_x86_ssse3_phsub_sw_128, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv16i8,
                                      int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm_int<0x00, "vpshufb", memopv16i8,
                                      int_x86_ssse3_pshuf_b_128, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb", memopv16i8,
                                      int_x86_ssse3_psign_b_128, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw", memopv8i16,
                                      int_x86_ssse3_psign_w_128, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd", memopv4i32,
                                      int_x86_ssse3_psign_d_128, 0>, VEX_4V;
}
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16,
                                      int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V;
}

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm_int<0x01, "phaddw", memopv8i16,
                                     int_x86_ssse3_phadd_w_128>;
  defm PHADDD    : SS3I_binop_rm_int<0x02, "phaddd", memopv4i32,
                                     int_x86_ssse3_phadd_d_128>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw", memopv8i16,
                                     int_x86_ssse3_phadd_sw_128>;
  defm PHSUBW    : SS3I_binop_rm_int<0x05, "phsubw", memopv8i16,
                                     int_x86_ssse3_phsub_w_128>;
  defm PHSUBD    : SS3I_binop_rm_int<0x06, "phsubd", memopv4i32,
                                     int_x86_ssse3_phsub_d_128>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw", memopv8i16,
                                     int_x86_ssse3_phsub_sw_128>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv16i8,
                                     int_x86_ssse3_pmadd_ub_sw_128>;
  defm PSHUFB    : SS3I_binop_rm_int<0x00, "pshufb", memopv16i8,
                                     int_x86_ssse3_pshuf_b_128>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", memopv16i8,
                                     int_x86_ssse3_psign_b_128>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", memopv8i16,
                                     int_x86_ssse3_psign_w_128>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", memopv4i32,
                                     int_x86_ssse3_psign_d_128>;
}
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16,
                                     int_x86_ssse3_pmul_hr_sw_128>;
}

let Predicates = [HasSSSE3] in {
  def : Pat<(X86pshufb VR128:$src, VR128:$mask),
            (PSHUFBrr128 VR128:$src, VR128:$mask)>;
  def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
            (PSHUFBrm128 VR128:$src, addr:$mask)>;

  def : Pat<(X86psignb VR128:$src1, VR128:$src2),
            (PSIGNBrr128 VR128:$src1, VR128:$src2)>;
  def : Pat<(X86psignw VR128:$src1, VR128:$src2),
            (PSIGNWrr128 VR128:$src1, VR128:$src2)>;
  def : Pat<(X86psignd VR128:$src1, VR128:$src2),
            (PSIGNDrr128 VR128:$src1, VR128:$src2)>;
}

let Predicates = [HasAVX] in {
  def : Pat<(X86pshufb VR128:$src, VR128:$mask),
            (VPSHUFBrr128 VR128:$src, VR128:$mask)>;
  def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
            (VPSHUFBrm128 VR128:$src, addr:$mask)>;

  def : Pat<(X86psignb VR128:$src1, VR128:$src2),
            (VPSIGNBrr128 VR128:$src1, VR128:$src2)>;
  def : Pat<(X86psignw VR128:$src1, VR128:$src2),
            (VPSIGNWrr128 VR128:$src1, VR128:$src2)>;
  def : Pat<(X86psignd VR128:$src1, VR128:$src2),
            (VPSIGNDrr128 VR128:$src1, VR128:$src2)>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, OpSize;
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, OpSize;
}
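// Illustrative semantics (sketch): palignr concatenates its two operands
// into a 32-byte value and extracts the 16-byte window starting at byte
// offset imm8. Note that the patterns below swap the operand order to match
// the X86PAlign node's convention.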

let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
  defm PALIGN : ssse3_palign<"palignr">;

let Predicates = [HasSSSE3] in {
def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}

let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>;
def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2),
                [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>;
}

let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB,
                 Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB,
                Requires<[HasSSE3]>;

def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
      Requires<[In32BitMode]>;
def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//

multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
       OpSize;
}
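// Illustrative semantics (sketch): these extend the low elements of the
// source to fill the destination, e.g. "pmovsxbw %xmm1, %xmm0" sign-extends
// the low 8 bytes of xmm1 to 8 words; the rm form therefore only needs a
// 64-bit load (i64mem).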

let Predicates = [HasAVX] in {
defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
                                     VEX;
defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
                                     VEX;
defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
                                     VEX;
defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
                                     VEX;
defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
                                     VEX;
defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
                                     VEX;
}

defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;

let Predicates = [HasSSE41] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
}

let Predicates = [HasAVX] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
}


multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
          OpSize;
}

let Predicates = [HasAVX] in {
defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
                                     VEX;
defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
                                     VEX;
defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>,
                                     VEX;
defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
                                     VEX;
}

defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;

let Predicates = [HasSSE41] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (PMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (PMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (PMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (PMOVZXWQrm addr:$src)>;
}

let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVZXWQrm addr:$src)>;
}

multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  // Expecting an i16 load any-extended to an i32 value.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId (bitconvert
                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
                 OpSize;
}
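// Illustrative semantics (sketch): pmovsxbq/pmovzxbq widen the two low bytes
// of the source all the way to two quadwords, so the rm forms only need a
// 16-bit load.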

let Predicates = [HasAVX] in {
defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
                                     VEX;
defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
                                     VEX;
}
defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;

let Predicates = [HasSSE41] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVSXBQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVZXBQrm addr:$src)>;
}

let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXBQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVZXBQrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
                 OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
}
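// Illustrative usage (sketch): "pextrb $3, %xmm0, %eax" copies byte 3 of
// xmm0, zero-extended, into eax; the mr form stores the selected byte
// directly to memory.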

let Predicates = [HasAVX] in {
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
  def  VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
         (ins VR128:$src1, i32i8imm:$src2),
         "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
}

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), addr:$dst)
}

let Predicates = [HasAVX] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, OpSize, REX_W;
}

let Predicates = [HasAVX] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;

/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or
/// memory destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
           OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)]>, OpSize;
}

let Predicates = [HasAVX] in {
  defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
                  (ins VR128:$src1, i32i8imm:$src2),
                  "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>, OpSize, VEX;
}
defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;

// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasSSE41]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
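
// For reference, a minimal user-level C sketch of the extract path modeled
// above (an illustration only, assuming <smmintrin.h> and -msse4.1; the
// function name is hypothetical):
//
//   #include <smmintrin.h>
//   int lane2_bits(__m128 v) {
//     return _mm_extract_ps(v, 2);  // extractps: raw bits of f32 lane 2
//   }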

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      OpSize;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      OpSize;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, OpSize;
}

let Predicates = [HasAVX] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

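// For reference, a minimal user-level C sketch of the insert path modeled
// above (an illustration only, assuming <smmintrin.h> and -msse4.1; the
// function name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128i set_lane1(__m128i v, int x) {
//     return _mm_insert_epi32(v, x, 1);  // pinsrd: replace i32 lane 1
//   }
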
// insertps has a few different modes; the first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector. The
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
      OpSize;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))]>, OpSize;
}

let Constraints = "$src1 = $dst" in
  defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
let Predicates = [HasAVX] in
  defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;

def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
          (VINSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>,
          Requires<[HasAVX]>;
def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
          (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>,
          Requires<[HasSSE41]>;
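
// For reference, a minimal user-level C sketch of the intrinsic form matched
// above (an illustration only, assuming <smmintrin.h> and -msse4.1; the
// function name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128 copy_lane(__m128 dst, __m128 src) {
//     // insertps imm 0x10: source lane 0 into dest lane 1, zero mask empty
//     return _mm_insert_ps(dst, src, 0x10);
//   }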

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PSm : Ii8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
                    TA, OpSize,
                Requires<[HasSSE41]>;

  // Vector intrinsic operation, reg
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
                    OpSize;
}

multiclass sse41_fp_unop_rm_avx_p<bits<8> opcps, bits<8> opcpd,
                   RegisterClass RC, X86MemOperand x86memop, string OpcodeStr> {
  // Vector intrinsic operation, reg
  def PSr_AVX : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, OpSize;

  // Vector intrinsic operation, mem
  def PSm_AVX : Ii8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, TA, OpSize, Requires<[HasSSE41]>;

  // Vector intrinsic operation, reg
  def PDr_AVX : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, OpSize;

  // Vector intrinsic operation, mem
  def PDm_AVX : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, OpSize;
}

multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
  // Intrinsic operation, reg.
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, reg.
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        OpSize;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        OpSize;
}

multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd,
                                   string OpcodeStr> {
  // Intrinsic operation, reg.
  def SSr_AVX : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, OpSize;

  // Intrinsic operation, mem.
  def SSm_AVX : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
        !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, OpSize;

  // Intrinsic operation, reg.
  def SDr_AVX : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, OpSize;

  // Intrinsic operation, mem.
  def SDm_AVX : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
        !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, OpSize;
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  memopv4f32, memopv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  memopv8f32, memopv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;

  // Instructions for the assembler
  defm VROUND  : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">,
                                        VEX;
  defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">,
                                        VEX;
  defm VROUND  : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}

defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
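
// For reference, a minimal user-level C sketch that typically lowers through
// int_x86_sse41_round_ps to the ROUNDPS pattern above (an illustration only,
// assuming <smmintrin.h> and -msse4.1; the function name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128 round_nearest(__m128 v) {
//     // imm 0x8: round to nearest, suppress precision exceptions
//     return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   }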

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// ptest instruction: we lower to this in X86ISelLowering, primarily from the
// Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>,
                OpSize, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>,
                OpSize, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                OpSize, VEX;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
                OpSize, VEX;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest \t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>,
              OpSize;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest \t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>,
              OpSize;
}
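
// For reference, a minimal user-level C sketch of the EFLAGS-producing ptest
// modeled above (an illustration only, assuming <smmintrin.h> and -msse4.1;
// the function name is hypothetical):
//
//   #include <smmintrin.h>
//   int is_all_zero(__m128i v) {
//     return _mm_testz_si128(v, v);  // ptest; returns 1 iff ZF was set
//   }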

// The bit test instructions below are AVX-only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            OpSize, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>;
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     OpSize, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>, OpSize, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     XS;
  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>, XS;
}
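
// For reference, a minimal user-level C sketch of the ctpop lowering above
// (an illustration only, assuming <popcntintrin.h> and -mpopcnt; the function
// name is hypothetical):
//
//   #include <popcntintrin.h>
//   int bits_set(unsigned x) {
//     return _mm_popcnt_u32(x);  // popcntl: matches (ctpop GR32:$src)
//   }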

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128
                       (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
}

let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                         int_x86_sse41_phminposuw>;

/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}

let Predicates = [HasAVX] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                                         0>, VEX_4V;
  defm VPCMPEQQ  : SS41I_binop_rm_int<0x29, "vpcmpeqq",  int_x86_sse41_pcmpeqq,
                                                         0>, VEX_4V;
  defm VPMINSB   : SS41I_binop_rm_int<0x38, "vpminsb",   int_x86_sse41_pminsb,
                                                         0>, VEX_4V;
  defm VPMINSD   : SS41I_binop_rm_int<0x39, "vpminsd",   int_x86_sse41_pminsd,
                                                         0>, VEX_4V;
  defm VPMINUD   : SS41I_binop_rm_int<0x3B, "vpminud",   int_x86_sse41_pminud,
                                                         0>, VEX_4V;
  defm VPMINUW   : SS41I_binop_rm_int<0x3A, "vpminuw",   int_x86_sse41_pminuw,
                                                         0>, VEX_4V;
  defm VPMAXSB   : SS41I_binop_rm_int<0x3C, "vpmaxsb",   int_x86_sse41_pmaxsb,
                                                         0>, VEX_4V;
  defm VPMAXSD   : SS41I_binop_rm_int<0x3D, "vpmaxsd",   int_x86_sse41_pmaxsd,
                                                         0>, VEX_4V;
  defm VPMAXUD   : SS41I_binop_rm_int<0x3F, "vpmaxud",   int_x86_sse41_pmaxud,
                                                         0>, VEX_4V;
  defm VPMAXUW   : SS41I_binop_rm_int<0x3E, "vpmaxuw",   int_x86_sse41_pmaxuw,
                                                         0>, VEX_4V;
  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                                         0>, VEX_4V;

  def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
            (VPCMPEQQrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
            (VPCMPEQQrm VR128:$src1, addr:$src2)>;
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in
  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
  defm PCMPEQQ  : SS41I_binop_rm_int<0x29, "pcmpeqq",  int_x86_sse41_pcmpeqq>;
  defm PMINSB   : SS41I_binop_rm_int<0x38, "pminsb",   int_x86_sse41_pminsb>;
  defm PMINSD   : SS41I_binop_rm_int<0x39, "pminsd",   int_x86_sse41_pminsd>;
  defm PMINUD   : SS41I_binop_rm_int<0x3B, "pminud",   int_x86_sse41_pminud>;
  defm PMINUW   : SS41I_binop_rm_int<0x3A, "pminuw",   int_x86_sse41_pminuw>;
  defm PMAXSB   : SS41I_binop_rm_int<0x3C, "pmaxsb",   int_x86_sse41_pmaxsb>;
  defm PMAXSD   : SS41I_binop_rm_int<0x3D, "pmaxsd",   int_x86_sse41_pmaxsd>;
  defm PMAXUD   : SS41I_binop_rm_int<0x3F, "pmaxud",   int_x86_sse41_pmaxud>;
  defm PMAXUW   : SS41I_binop_rm_int<0x3E, "pmaxuw",   int_x86_sse41_pmaxuw>;
  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq>;
}

def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
          (PCMPEQQrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
          (PCMPEQQrm VR128:$src1, addr:$src2)>;
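
// For reference, a minimal user-level C sketch of the X86pcmpeqq patterns
// above (an illustration only, assuming <smmintrin.h> and -msse4.1; the
// function name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128i eq64(__m128i a, __m128i b) {
//     return _mm_cmpeq_epi64(a, b);  // pcmpeqq: all-ones per equal i64 lane
//   }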

/// SS48I_binop_rm - Simple SSE 4.1 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>,
       OpSize;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_v4i32 (memopv2i64 addr:$src2))))]>,
       OpSize;
}

let Predicates = [HasAVX] in
  defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>;
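
// For reference, a minimal user-level C sketch of the (mul v4i32) pattern
// above (an illustration only, assuming <smmintrin.h> and -msse4.1; the
// function name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128i mul_lo32(__m128i a, __m128i b) {
//     return _mm_mullo_epi32(a, b);  // pmulld: low 32 bits of each product
//   }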

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
        OpSize;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
        OpSize;
}

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
  defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
  defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
  defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
            int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V;
  defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
            int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V;
  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
  }
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, memopv16i8, i128mem, 0>, VEX_4V;
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, memopv16i8, i128mem, 0>, VEX_4V;
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                   VR256, memopv32i8, i256mem, 0>, VEX_4V;
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
                                     VR128, memopv16i8, i128mem>;
  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
                                     VR128, memopv16i8, i128mem>;
  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                     VR128, memopv16i8, i128mem>;
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv16i8, i128mem>;
  }
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv16i8, i128mem>;
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv16i8, i128mem>;
}
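
// For reference, a minimal user-level C sketch of the immediate-blend form
// above (an illustration only, assuming <smmintrin.h> and -msse4.1; the
// function name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128 blend_low2(__m128 a, __m128 b) {
//     return _mm_blend_ps(a, b, 0x3);  // blendps: lanes 0-1 from b, 2-3 from a
//   }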

/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
let Predicates = [HasAVX] in {
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId> {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;

  def rm : I<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
}
}

defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem,
                                           memopv16i8, int_x86_sse41_blendvpd>;
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem,
                                           memopv16i8, int_x86_sse41_blendvps>;
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           memopv16i8, int_x86_sse41_pblendvb>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem,
                                         memopv32i8, int_x86_avx_blendv_pd_256>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem,
                                         memopv32i8, int_x86_avx_blendv_ps_256>;

let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

/// SS41I_ternary_int - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
                    OpSize;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
  }
}

defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;

let Predicates = [HasSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
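
// For reference, a minimal user-level C sketch of the vselect-to-BLENDVPS
// lowering above (an illustration only, assuming <smmintrin.h> and -msse4.1;
// the function name is hypothetical). The non-VEX form keeps its mask in
// XMM0, matching the Uses list:
//
//   #include <smmintrin.h>
//   __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
//     // lanes whose mask sign bit is set come from a, the rest from b
//     return _mm_blendv_ps(b, a, mask);
//   }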

let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize, VEX;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       OpSize;
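
// For reference, a minimal user-level C sketch of the non-temporal load above
// (an illustration only, assuming <smmintrin.h> and -msse4.1; the function
// name is hypothetical):
//
//   #include <smmintrin.h>
//   __m128i stream_load(__m128i *p) {
//     return _mm_stream_load_si128(p);  // movntdqa: streaming 128-bit load
//   }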

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       OpSize;
  def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}

let Predicates = [HasAVX] in {
  defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
                                     0>, VEX_4V;

  def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
            (VPCMPGTQrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
            (VPCMPGTQrm VR128:$src1, addr:$src2)>;
}

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;

def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
          (PCMPGTQrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
          (PCMPGTQrm VR128:$src1, addr:$src2)>;
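
// For reference, a minimal user-level C sketch of the X86pcmpgtq patterns
// above (an illustration only, assuming <nmmintrin.h> and -msse4.2; the
// function name is hypothetical):
//
//   #include <nmmintrin.h>
//   __m128i gt64(__m128i a, __m128i b) {
//     return _mm_cmpgt_epi64(a, b);  // pcmpgtq: signed i64 greater-than
//   }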

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

// Packed Compare Implicit Length Strings, Return Mask
multiclass pseudo_pcmpistrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128
                       VR128:$src1, (load addr:$src2), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>;
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
}

let Defs = [XMM0, EFLAGS], Predicates = [HasAVX] in {
  def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
  def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
}

let Defs = [XMM0, EFLAGS] in {
  def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
  def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
}

// Packed Compare Explicit Length Strings, Return Mask
multiclass pseudo_pcmpestrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>;
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
}

let Predicates = [HasAVX],
    Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
  def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
  def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
  def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
  def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
}

// Packed Compare Implicit Length Strings, Return Index
let Defs = [ECX, EFLAGS] in {
  multiclass SS42AI_pcmpistri<Intrinsic IntId128, string asm = "pcmpistri"> {
    def rr : SS42AI<0x63, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
      [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
       (implicit EFLAGS)]>, OpSize;
    def rm : SS42AI<0x63, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
      [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
       (implicit EFLAGS)]>, OpSize;
  }
}

let Predicates = [HasAVX] in {
defm VPCMPISTRI  : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">,
                                    VEX;
defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">,
                                    VEX;
defm VPCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128, "vpcmpistri">,
                                    VEX;
defm VPCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128, "vpcmpistri">,
                                    VEX;
defm VPCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128, "vpcmpistri">,
                                    VEX;
defm VPCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128, "vpcmpistri">,
                                    VEX;
}

defm PCMPISTRI  : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;
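
// For reference, a minimal user-level C sketch of the implicit-length
// compare-index form above, whose result is defined in ECX (an illustration
// only, assuming <nmmintrin.h> and -msse4.2; the function name is
// hypothetical):
//
//   #include <nmmintrin.h>
//   int find_any(__m128i needles, __m128i haystack) {
//     // pcmpistri: index of first haystack byte equal to any needle byte
//     return _mm_cmpistri(needles, haystack,
//                         _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
//   }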

// Packed Compare Explicit Length Strings, Return Index
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in {
  multiclass SS42AI_pcmpestri<Intrinsic IntId128, string asm = "pcmpestri"> {
    def rr : SS42AI<0x61, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
      [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
       (implicit EFLAGS)]>, OpSize;
    def rm : SS42AI<0x61, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
       [(set ECX,
             (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)),
        (implicit EFLAGS)]>, OpSize;
  }
}

let Predicates = [HasAVX] in {
defm VPCMPESTRI  : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">,
                                    VEX;
defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">,
                                    VEX;
defm VPCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128, "vpcmpestri">,
                                    VEX;
defm VPCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128, "vpcmpestri">,
                                    VEX;
defm VPCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128, "vpcmpestri">,
                                    VEX;
defm VPCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128, "vpcmpestri">,
                                    VEX;
}

defm PCMPESTRI  : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
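
// For reference, a minimal user-level C sketch of the explicit-length form
// above; the operand lengths travel in EAX and EDX, as the Uses lists state
// (an illustration only, assuming <nmmintrin.h> and -msse4.2; the function
// name is hypothetical):
//
//   #include <nmmintrin.h>
//   int find_any_n(__m128i needles, int nlen, __m128i hay, int hlen) {
//     return _mm_cmpestri(needles, nlen, hay, hlen,
//                         _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
//   }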

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// CRC intrinsic instructions.
// This set of instructions comes in rr and rm pairs; the variants differ
// only in the sizes of r and m.
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i8mem:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_8 GR32:$src1,
                         (load addr:$src2)))]>;
  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR8:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i16mem:$src2),
                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_16 GR32:$src1,
                         (load addr:$src2)))]>,
                         OpSize;
  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR16:$src2),
                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
                         OpSize;
  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i32mem:$src2),
                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_32 GR32:$src1,
                         (load addr:$src2)))]>;
  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2),
                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$src1, i8mem:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_8 GR64:$src1,
                         (load addr:$src2)))]>,
                         REX_W;
  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
                      (ins GR64:$src1, GR8:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
                         REX_W;
  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$src1, i64mem:$src2),
                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_64 GR64:$src1,
                         (load addr:$src2)))]>,
                         REX_W;
  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
                      (ins GR64:$src1, GR64:$src2),
                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
                         REX_W;
}
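
// For reference, a minimal user-level C sketch of the accumulating CRC32
// pattern above (an illustration only, assuming <nmmintrin.h> and -msse4.2;
// the function name is hypothetical):
//
//   #include <nmmintrin.h>
//   unsigned crc32_words(unsigned crc, const unsigned *p, int n) {
//     for (int i = 0; i < n; ++i)
//       crc = _mm_crc32_u32(crc, p[i]);  // crc32l folds in each word
//     return crc;
//   }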

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

6298multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6299                              Intrinsic IntId128, bit Is2Addr = 1> {
6300  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
6301       (ins VR128:$src1, VR128:$src2),
6302       !if(Is2Addr,
6303           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6304           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6305       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
6306       OpSize;
6307  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
6308       (ins VR128:$src1, i128mem:$src2),
6309       !if(Is2Addr,
6310           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6311           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6312       [(set VR128:$dst,
6313         (IntId128 VR128:$src1,
6314          (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
6315}
6316
6317// Perform One Round of an AES Encryption/Decryption Flow
6318let Predicates = [HasAVX, HasAES] in {
6319  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
6320                         int_x86_aesni_aesenc, 0>, VEX_4V;
6321  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
6322                         int_x86_aesni_aesenclast, 0>, VEX_4V;
6323  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
6324                         int_x86_aesni_aesdec, 0>, VEX_4V;
6325  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
6326                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
6327}
6328
6329let Constraints = "$src1 = $dst" in {
6330  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
6331                         int_x86_aesni_aesenc>;
6332  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
6333                         int_x86_aesni_aesenclast>;
6334  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
6335                         int_x86_aesni_aesdec>;
6336  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
6337                         int_x86_aesni_aesdeclast>;
6338}
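
// Each defm above expands to a register-register and a register-memory form
// (e.g. AESENCrr / AESENCrm, referenced by the patterns below). A minimal C
// sketch of a full AES-128 encryption built from these instructions (assuming
// <wmmintrin.h> and a pre-expanded key schedule ks[0..10]; names are
// illustrative, not from this file):
//
//   #include <wmmintrin.h>
//   __m128i aes128_encrypt_block(__m128i block, const __m128i ks[11]) {
//     block = _mm_xor_si128(block, ks[0]);          // initial AddRoundKey
//     for (int i = 1; i < 10; ++i)
//       block = _mm_aesenc_si128(block, ks[i]);     // one AESENC per round
//     return _mm_aesenclast_si128(block, ks[10]);   // final round, AESENCLAST
//   }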

let Predicates = [HasAES] in {
  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
            (AESENCrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
            (AESENCrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
            (AESENCLASTrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
            (AESENCLASTrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
            (AESDECrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
            (AESDECrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
            (AESDECLASTrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
            (AESDECLASTrm VR128:$src1, addr:$src2)>;
}

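// AddedComplexity = 20 makes instruction selection prefer the VEX-encoded AVX
// forms below over the legacy SSE forms above when both predicates match.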
let Predicates = [HasAVX, HasAES], AddedComplexity = 20 in {
  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
            (VAESENCrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
            (VAESENCrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
            (VAESENCLASTrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
            (VAESENCLASTrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
            (VAESDECrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
            (VAESDECrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
            (VAESDECLASTrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
            (VAESDECLASTrm VR128:$src1, addr:$src2)>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>,
      OpSize, VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>,
      OpSize, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>,
  OpSize;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>,
  OpSize;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      OpSize, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),
                                        imm:$src2))]>,
      OpSize, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  OpSize;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),
                                    imm:$src2))]>,
  OpSize;
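
// The 8-bit immediate supplies the round constant (RCON) used during key
// expansion. A hedged C sketch of one AES-128 key-expansion step (the
// function name is illustrative, not from this file; assuming <wmmintrin.h>):
//
//   __m128i expand_step(__m128i key) {
//     return _mm_aeskeygenassist_si128(key, 0x01);  // RCON for round 1
//   }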

//===----------------------------------------------------------------------===//
// CLMUL Instructions
//===----------------------------------------------------------------------===//

// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           []>;

def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           []>;
}

// AVX carry-less Multiplication instructions
def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>;

def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           []>;

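// In the pclmulqdq immediate, bit 0 selects the quadword of the first source
// operand and bit 4 selects the quadword of the second; the aliases below
// name the four combinations (0x00 = low*low ... 0x11 = high*high).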
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm,
                             "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;

  def : InstAlias<!strconcat("pclmul", asm,
                             "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
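
// A hedged C sketch of carry-less multiplication via the corresponding
// intrinsic (assuming <wmmintrin.h>; imm 0x00 multiplies the low quadwords,
// i.e. the pclmullqlqdq alias above):
//
//   #include <wmmintrin.h>
//   __m128i clmul_lo(__m128i a, __m128i b) {
//     return _mm_clmulepi64_si128(a, b, 0x00);   // would select PCLMULQDQrr
//   }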

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    X86MemOperand x86memop, Intrinsic Int> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (Int addr:$src))]>, VEX;

def VBROADCASTSS   : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
                                   int_x86_avx_vbroadcastss>;
def VBROADCASTSSY  : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
                                   int_x86_avx_vbroadcastss_256>;
def VBROADCASTSD   : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
                                   int_x86_avx_vbroadcast_sd_256>;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                   int_x86_avx_vbroadcastf128_pd_256>;

def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;

def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSY addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSD addr:$src)>;
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
          (VBROADCASTSSY addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
          (VBROADCASTSD addr:$src)>;

def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
          (VBROADCASTSS addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSS addr:$src)>;
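
// AVX1 only provides the memory-source forms above; broadcasts from an XMM
// register arrived later with AVX2. Typical C usage (assuming <immintrin.h>;
// the function name is illustrative):
//
//   __m256 splat(const float *p) {
//     return _mm256_broadcast_ss(p);   // would select VBROADCASTSSY
//   }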

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V;
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V;

def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3),
          (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;

def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                   (i32 imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                   (i32 imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (i32 imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (i32 imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (i32 imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (i32 imm)),
          (VINSERTF128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
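
// The immediate selects which 128-bit half of the YMM destination receives
// the XMM source: 0 for the low half, 1 for the high half.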

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX;
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, VEX;

def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
          (VEXTRACTF128rr VR256:$src1, imm:$src2)>;

def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
          (v4f32 (VEXTRACTF128rr
                    (v8f32 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
          (v2f64 (VEXTRACTF128rr
                    (v4f64 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
          (v4i32 (VEXTRACTF128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
          (v2i64 (VEXTRACTF128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
          (v8i16 (VEXTRACTF128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
          (v16i8 (VEXTRACTF128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
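
// As with VINSERTF128, the immediate picks which 128-bit half to extract.
// Typical C usage (assuming <immintrin.h>):
//
//   __m128 hi = _mm256_extractf128_ps(v, 1);   // would select VEXTRACTF128rr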

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          PatFrag pf128, PatFrag pf256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
}

defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256,
                                 memopv4f32, memopv8f32>;
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256,
                                 memopv2f64, memopv4f64>;
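
// Each element is loaded or stored only when the sign bit of the matching
// mask element is set; masked-off load elements read as zero, masked-off
// store elements leave memory untouched, and masked-off elements do not
// fault. Typical C usage (assuming <immintrin.h>):
//
//   __m256 v = _mm256_maskload_ps(p, mask);   // would select VMASKMOVPSYrm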

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag,
                      Intrinsic IntVar, Intrinsic IntImm> {
  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V;
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop_i:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntVar RC:$src1, (i_frag addr:$src2)))]>, VEX_4V;

  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntImm RC:$src1, imm:$src2))]>, VEX;
  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, i8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (IntImm (f_frag addr:$src1), imm:$src2))]>, VEX;
}

defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                             memopv4f32, memopv4i32,
                             int_x86_avx_vpermilvar_ps,
                             int_x86_avx_vpermil_ps>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                             memopv8f32, memopv8i32,
                             int_x86_avx_vpermilvar_ps_256,
                             int_x86_avx_vpermil_ps_256>;
defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                             memopv2f64, memopv2i64,
                             int_x86_avx_vpermilvar_pd,
                             int_x86_avx_vpermil_pd>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                             memopv4f64, memopv4i64,
                             int_x86_avx_vpermilvar_pd_256,
                             int_x86_avx_vpermil_pd_256>;

def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
          (VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
          (VPERMILPDYri VR256:$src1, imm:$imm)>;
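
// For the immediate forms, each field of the 8-bit control selects a source
// element from within that element's own 128-bit lane, shufps-style; the
// 256-bit versions reuse the same control for both lanes.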

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, VEX_4V;

def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, VR256:$src2, imm:$src3),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;

def : Pat<(int_x86_avx_vperm2f128_ps_256
                  VR256:$src1, (memopv8f32 addr:$src2), imm:$src3),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vperm2f128_pd_256
                  VR256:$src1, (memopv4f64 addr:$src2), imm:$src3),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vperm2f128_si_256
                  VR256:$src1, (memopv8i32 addr:$src2), imm:$src3),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;

def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
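
// In the vperm2f128 immediate, bits 1:0 select the source 128-bit lane for
// the low half of the destination, bits 5:4 select it for the high half,
// and bits 3 and 7 zero the corresponding half instead.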

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
}
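
// Compilers emit vzeroupper before legacy SSE code to avoid the penalty the
// hardware imposes when SSE instructions execute with dirty upper YMM state.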

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//
let Predicates = [HasAVX, HasF16C] in {
  def VCVTPH2PSrm : I<0x13, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
  def VCVTPH2PSrr : I<0x13, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
  def VCVTPH2PSYrm : I<0x13, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
  def VCVTPH2PSYrr : I<0x13, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
  def VCVTPS2PHmr : Ii8<0x1D, MRMDestMem, (outs f64mem:$dst),
                      (ins VR128:$src1, i32i8imm:$src2),
                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      TA, OpSize, VEX;
  def VCVTPS2PHrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
                      (ins VR128:$src1, i32i8imm:$src2),
                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      TA, OpSize, VEX;
  def VCVTPS2PHYmr : Ii8<0x1D, MRMDestMem, (outs f128mem:$dst),
                      (ins VR256:$src1, i32i8imm:$src2),
                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      TA, OpSize, VEX;
  def VCVTPS2PHYrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
                      (ins VR256:$src1, i32i8imm:$src2),
                      "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                      TA, OpSize, VEX;
}
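
// vcvtps2ph takes a rounding-control immediate. A hedged C sketch of a round
// trip through half precision (assuming F16C and <immintrin.h>; the function
// names are illustrative):
//
//   __m128i to_half(__m128 f) {
//     return _mm_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT |
//                            _MM_FROUND_NO_EXC);   // would select VCVTPS2PHrr
//   }
//   __m128 to_float(__m128i h) {
//     return _mm_cvtph_ps(h);                      // would select VCVTPH2PSrr
//   }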