//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions
// and the properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
       Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
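
// The {A|B} braces in the asm strings above are the backend's dual-syntax
// convention: text before '|' is the AT&T operand order, text after is
// Intel. As a purely illustrative (hypothetical) instantiation:
//
//   defm ADDSS : sse12_fp_scalar<0x58, "addss", any_fadd, FR32, f32mem,
//                                SSEPackedSingle, WriteFAdd>, XS;
//
// would produce ADDSSrr (register-register) and ADDSSrm (register-memory);
// Is2Addr selects between the 2-operand SSE asm string and the 3-operand
// AVX-style string.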

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, d>,
       Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// Alias instructions that map fld0 to xorps for SSE or vxorps for AVX.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}
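
// Illustrative expansion (the usual post-RA behavior for these pseudos):
// FsFLD0SS is rewritten to "xorps %xmm0, %xmm0" (or "vxorps" under AVX),
// materializing +0.0 without touching memory; canFoldAsLoad still permits a
// constant-pool load of the zero when folding it is profitable.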

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps a zero vector to pxor / xorp* for SSE.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}
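
// Illustrative selection: a v4i32 zeroinitializer matches the patterns above
// and selects V_SET0, which ExpandPostRAPseudos turns into
// "xorps %xmm0, %xmm0"; ExecutionDomainFix may later rewrite it to "pxor"
// when the surrounding code runs in the integer domain.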


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it, because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
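
// Illustrative note: the all-ones pseudos are typically expanded to
// "pcmpeqd %xmm0, %xmm0", since comparing a register with itself sets every
// lane to all-one bits; canFoldAsLoad keeps the constant-pool alternative
// available when folding the value into another instruction is cheaper.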

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroes the upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}
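
// Illustrative semantics: "movss (%rax), %xmm0" (MOVSSrm) writes the f32
// into bits [31:0] of %xmm0 and zeroes bits [127:32], which is exactly what
// the X86vzload32 fragment models; the _alt forms instead produce a plain
// FR32/FR64 scalar load.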

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns as above, but in the form they appear for
  // 256-bit types.
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended: zero a VR128, then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}
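
// Alignment semantics (for reference): movaps/movapd require their memory
// operand to be 16-byte aligned and raise #GP otherwise, while
// movups/movupd accept any alignment; hence the aligned forms above use the
// alignedload fragments and the unaligned forms use plain loads.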

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", []>,
                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
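
// Usage note (illustrative): the ".s" suffix lets assembly writers force
// the reversed encoding, e.g. "movaps.s %xmm1, %xmm2" assembles to the
// store-form opcode 0x29 (MRMDestReg) instead of the default load-form
// 0x28, which matters for byte-exact round-tripping of disassembled code.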

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating-point loads/stores in case we
  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
  // is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignedloadv8f16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedloadv16f16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv16f16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
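
// Encoding note: movaps (0F 28) is one byte shorter than movdqa (66 0F 6F),
// which is why the integer loads/stores above select the PS forms; the SSE
// execution-domain pass then rewrites them to MOVDQA/MOVDQU when staying in
// the integer domain is preferable.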

let Predicates = [UseSSE2] in {
  def : Pat<(alignedloadv8f16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                 (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load; we're only loading 64 bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}
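
// Note on the immediate: (i8 -28) is 0xE4 (0b11100100), the shufps control
// that takes elements 0 and 1 from the first (loaded) operand and elements
// 2 and 3 from $src1, i.e. a replace-the-low-64-bits shuffle, which is
// exactly what MOVLPS does.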

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load; we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                      NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}
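
// Semantics (for reference): movlhps copies the low 64 bits of $src2 into
// the high 64 bits of the destination, and movhlps copies the high 64 bits
// of $src2 into the low 64 bits; the untouched half comes from $src1, which
// is why the SSE forms tie $src1 to $dst.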

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, string mem, X86FoldableSchedWrite sched,
                     Domain d,
                     SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
  }
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                                    (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_W, VEX_LIG;

defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_W, VEX_LIG;
defm VCVTSD2SI   : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}

let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                      "cvttss2si", "cvttss2si",
                      WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                      "cvttss2si", "cvttss2si",
                      WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                      "cvttsd2si", "cvttsd2si",
                      WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                      "cvttsd2si", "cvttsd2si",
                      WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                     "cvtss2si", "cvtss2si",
                     WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                     "cvtss2si", "cvtss2si",
                     WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                     "cvtsd2si", "cvtsd2si",
                     WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                     "cvtsd2si", "cvtsd2si",
                     WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss", "cvtsi2ss{l}",
                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss", "cvtsi2ss{q}",
                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd", "cvtsi2sd{l}",
                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd", "cvtsi2sd{q}",
                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}
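
// Mapping note: lrint/llrint round according to the current rounding mode,
// matching cvtss2si/cvtsd2si (opcode 0x2D), which honor MXCSR.RC; the
// truncating cvtt* forms (opcode 0x2C) are instead used for any_fp_to_sint,
// which requires round-toward-zero semantics.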
1022
1023// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1024// and/or XMM operand(s).
1025
1026multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1027                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
1028                          Operand memop, PatFrags mem_frags, string asm,
1029                          X86FoldableSchedWrite sched, Domain d> {
1030let ExeDomain = d in {
1031  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1032                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1033                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1034               Sched<[sched]>;
1035  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1036                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1037                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
1038               Sched<[sched.Folded]>;
1039}
1040}
1041
1042multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1043                    RegisterClass DstRC, X86MemOperand x86memop,
1044                    string asm, string mem, X86FoldableSchedWrite sched,
1045                    Domain d, bit Is2Addr = 1> {
1046let hasSideEffects = 0, ExeDomain = d in {
1047  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1048                  !if(Is2Addr,
1049                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1050                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1051                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
1052  let mayLoad = 1 in
1053  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1054                  (ins DstRC:$src1, x86memop:$src2),
1055                  !if(Is2Addr,
1056                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
1057                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
1058                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
1059}
1060}
1061
1062let Uses = [MXCSR], mayRaiseFPException = 1 in {
1063let Predicates = [UseAVX] in {
1064defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1065                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1066                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1067defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1068                    X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1069                    WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
1070}
1071defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1072                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1073                 SSEPackedDouble>, XD;
1074defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1075                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1076                   SSEPackedDouble>, XD, REX_W;
1077}
1078
1079let Predicates = [UseAVX] in {
1080defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1081          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1082          XS, VEX_4V, VEX_LIG, SIMD_EXC;
1083defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1084          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1085          XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1086defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1087          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1088          XD, VEX_4V, VEX_LIG;
1089defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1090          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1091          XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1092}
1093let Constraints = "$src1 = $dst" in {
1094  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1095                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1096                        XS, SIMD_EXC;
1097  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1098                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1099                        XS, REX_W, SIMD_EXC;
1100  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1101                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1102                        XD;
1103  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1104                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1105                        XD, REX_W, SIMD_EXC;
1106}
1107
1108def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1109               (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1110def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1111               (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1112def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1113               (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1114def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1115               (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1116
1117def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1118              (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1119def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1120              (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1121
1122def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1123                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1124def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1125                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1126def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1127                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1128def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1129                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1130
1131def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1132                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1133def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1134                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
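
// A minimal C-level illustration (an exposition aid, not part of this file)
// of what the CVTSI2SS/CVTSI2SD definitions above implement; compiled with
// SSE2 enabled, each conversion below becomes one of these instructions:
//
//   #include <xmmintrin.h>
//   float  itof(int i)       { return (float)i;  }   // cvtsi2ss{l}
//   double itod(long long i) { return (double)i; }   // cvtsi2sd{q}, x86-64
//   __m128 lane0(__m128 a, int i) {
//     return _mm_cvtsi32_ss(a, i);                   // CVTSI2SSrr_Int form
//   }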

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                               X86cvtts2Int, ssmem, sse_load_f32,
                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                              X86cvtts2Int, sdmem, sse_load_f64,
                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
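
// For exposition (not part of this file): in C, a float-to-int cast is the
// truncating form matched above, independent of the MXCSR rounding mode:
//
//   int trunc_f(float f)  { return (int)f; }   // cvttss2si
//   int trunc_d(double d) { return (int)d; }   // cvttsd2si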

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;

defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PS>,
                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PSY>,
                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
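
// Counterpart sketch for the rounding forms above (an exposition aid,
// assuming the SSE intrinsics from <emmintrin.h>); these honor the current
// MXCSR rounding mode instead of truncating:
//
//   #include <emmintrin.h>
//   int round_f(__m128  a) { return _mm_cvtss_si32(a); }   // cvtss2si
//   int round_d(__m128d a) { return _mm_cvtsd_si32(a); }   // cvtsd2si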

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
    ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR32:$src1, FR64:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        VEX_4V, VEX_LIG, VEX_WIG,
                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     (ins FR32:$src1, f64mem:$src2),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG,
                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

def : Pat<(f32 (any_fpround FR64:$src)),
            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                    XD, Requires<[UseSSE2, OptForSize]>,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}
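
// Semantics sketch for the _Int forms above (an exposition aid, not part of
// this file): X86frounds rounds only lane 0 of $src2 and passes the upper
// lanes of $src1 through, matching the corresponding intrinsic:
//
//   #include <emmintrin.h>
//   __m128 narrow(__m128 a, __m128d b) {
//     return _mm_cvtsd_ss(a, b);  // lane 0 = (float)b[0], lanes 1-3 = a[1..3]
//   }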

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (any_fpextend FR32:$src)),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
} // isCodeGenOnly = 1

let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
    ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions
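//
// A C-level sketch of one such sequence (an exposition aid; assumes the
// clang vector-subscript extension):
//
//   #include <emmintrin.h>
//   __m128 fold_me(__m128 a, __m128d b) {
//     a[0] = b[0];   // extractelt + fptrunc + movss-style reinsert
//     return a;      // the patterns below fold this into one (v)cvtsd2ss
//   }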
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;

// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
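
// Why the {x}/{y} suffixes: with a register source the operand width picks
// the form, but a memory operand is ambiguous. Illustrative AT&T assembly
// (an exposition aid, not generated output):
//
//   vcvtpd2dq  %xmm1, %xmm0      // unambiguous: xmm source
//   vcvtpd2dq  %ymm1, %xmm0      // unambiguous: ymm source
//   vcvtpd2dqx (%rax), %xmm0     // 128-bit load form must be spelled out
//   vcvtpd2dqy (%rax), %xmm0     // 256-bit load form must be spelled out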

def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>, SIMD_EXC;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;
}

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
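
// Illustrative intrinsic-level view (an assumption for exposition: AVX
// enabled, <immintrin.h>): four doubles only fit in a ymm register, so the
// v4f64 pattern above needs the YMM form, while the 128-bit form converts
// two doubles and zeroes the upper result lanes:
//
//   #include <immintrin.h>
//   __m128i t2(__m128d a) { return _mm_cvttpd_epi32(a); }    // VCVTTPD2DQrr
//   __m128i t4(__m256d a) { return _mm256_cvttpd_epi32(a); } // VCVTTPD2DQYrr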

// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86any_VSintToFP
                                  (bc_v4i32
                                   (v2i64 (scalar_to_vector
                                           (loadi64 addr:$src)))))))]>,
                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                         VEX_WIG;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP
                                 (bc_v4i32
                                  (v2i64 (scalar_to_vector
                                          (loadi64 addr:$src)))))))]>,
                       Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                       Sched<[WriteCvtI2PD]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
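
// Illustrative source for the X86vzload64 patterns above (an exposition aid,
// not part of this file): loading the low 64 bits and converting two i32
// lanes matches the rm form directly, so no separate zero-extending load is
// emitted:
//
//   #include <emmintrin.h>
//   __m128d two(const __m128i *p) {
//     return _mm_cvtepi32_pd(_mm_loadl_epi64(p));  // folds to cvtdq2pd (mem)
//   }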

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand memop, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched,
                            PatFrags mem_frags> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              VR128:$src2, timm:$cc))]>,
           Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              (mem_frags addr:$src2), timm:$cc))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;

  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in
    def rr : SIi8<0xC2, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
                  Sched<[sched]>, SIMD_EXC;
    def rm : SIi8<0xC2, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1,
                                         (ld_frag addr:$src2), timm:$cc))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  }
}
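
// For reference (an illustrative sketch, not part of this file): the $cc
// immediate selects the predicate, so e.g. the "less-than" intrinsic becomes
// cmpss with immediate 1:
//
//   #include <xmmintrin.h>
//   __m128 lt(__m128 a, __m128 b) {
//     return _mm_cmplt_ss(a, b);   // cmpss $1, %xmm1, %xmm0 (AT&T)
//   }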

let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                 XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                 XD, VEX_4V, VEX_LIG, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr, Domain d,
                         X86FoldableSchedWrite sched = WriteFComX> {
  let ExeDomain = d in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             PatFrags mem_frags, string OpcodeStr,
                             Domain d,
                             X86FoldableSchedWrite sched = WriteFComX> {
let ExeDomain = d in {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (mem_frags addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

  let isCodeGenOnly = 1 in {
    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  }
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                                  "ucomiss", SSEPackedSingle>, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                                  "ucomisd", SSEPackedDouble>, PD;
  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                                  "comiss", SSEPackedSingle>, PS;
  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                                  "comisd", SSEPackedDouble>, PD;

  let isCodeGenOnly = 1 in {
    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;

    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
            Sched<[sched]>, SIMD_EXC;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
             [(set RC:$dst,
               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
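
// Worked check (exposition only): masking to the low three bits keeps the
// base predicate, and of those, EQ (0x00), UNORD (0x03), NEQ (0x04) and
// ORD (0x07) give the same answer with their operands swapped, e.g.
//
//   cmpeq(a, b) == cmpeq(b, a)    // equality is symmetric
//   unord(a, b) == unord(b, a)    // "either operand is NaN" is symmetric
//
// whereas LT/LE (0x01/0x02) and their negations are not. This is what lets
// the patterns below move a load from the first operand into the
// instruction's memory slot.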

// Patterns to select compares with loads in first operand.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 timm:$src3))))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  let isCommutable = IsCommutable in
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                     (i8 timm:$src3))))], d>,
            Sched<[sched]>;
}
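
// Immediate semantics sketch for shufps (exposition only): each 2-bit field
// of $src3 selects a lane; the low two fields pick from $src1, the high two
// from $src2. For example:
//
//   #include <xmmintrin.h>
//   __m128 mix(__m128 a, __m128 b) {
//     // result = { a[0], a[1], b[2], b[3] }, imm = 0b11100100 = 0xE4
//     return _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   }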

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
    let isCommutable = IsCommutable in
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}
2114
2115let Predicates = [HasAVX, NoVLX] in {
2116defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2117      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2118                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2119defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2120      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2121                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2122defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2123      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2124                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2125defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2126      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2127                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2128
2129defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2130      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2131                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2132defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2133      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2134                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2135defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2136      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2137                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2138defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2139      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2140                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2141}// Predicates = [HasAVX, NoVLX]
2142
2143let Constraints = "$src1 = $dst" in {
2144  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2145        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2146                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2147  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2148        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2149                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2150  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2151        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2152                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2153  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2154        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2155                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2156} // Constraints = "$src1 = $dst"
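
// Illustrative only: a minimal C sketch of the interleaving these
// instructions perform (intrinsics from <xmmintrin.h>):
//   __m128 lo = _mm_unpacklo_ps(A, B); // { A[0], B[0], A[1], B[1] }
//   __m128 hi = _mm_unpackhi_ps(A, B); // { A[2], B[2], A[3], B[3] }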

let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [UseSSE2] in {
  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (v2f64 (simple_load addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 packed FP sign-mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
              Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;

  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}
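
// Illustrative only: MOVMSKPS packs the per-lane sign bits into the low bits
// of a GPR. A hypothetical C use, via <xmmintrin.h>:
//   int any_negative(__m128 V) {
//     return _mm_movemask_ps(V) != 0; // bit i of the result = sign of V[i]
//   }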

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are placed here to satisfy pattern-ordering requirements with the FP
// versions below.

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;
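
// Illustrative only: these map onto the bitwise SSE2 intrinsics from
// <emmintrin.h>; note PANDN computes (~$src1 & $src2), matching X86andnp:
//   __m128i masked = _mm_andnot_si128(Mask, Val); // (~Mask) & Val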

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
       [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
       [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
         [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
         [], []>, PD;
  }
}

defm AND  : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
defm OR   : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
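// For example (illustrative only), a 256-bit integer 'and' coming out of
// vectorized IR cannot use VPANDYrr (an AVX2 instruction), so the patterns
// below select the bitwise-identical VANDPSYrr instead.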
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when we don't have integer types available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
/// classes below.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode, X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>, PD;
  }
}
}

multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                         XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              sched.PD.Scl>, XD;
  }
}
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl>, XD;
  }
}
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
                                ValueType VT, ValueType EltTy,
                                RegisterClass RC, PatFrag ld_frag,
                                Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }
}

defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;

defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;

/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.

/// sse_fp_unop_s - SSE 1 & 2 unops in scalar form.
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          X86MemOperand x86memop, Operand intmemop,
                          SDPatternOperator OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
            Requires<[target]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>,
            Requires<[target, OptForSize]>;
  }

  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

}

multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }
  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
               (!cast<Instruction>(NAME#m_Int)
                      (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>(NAME#r_Int) VR128:$src,
                                 VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int)
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDPatternOperator OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  let hasSideEffects = 0, ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, intmemop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
            addr:$src)>;
  }
}

/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDPatternOperator OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s_intr<v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
                      UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s_intr<v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
                      AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}

multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                       XS, VEX_4V, VEX_LIG, VEX_WIG;
}

multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;
}

// Square root.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_s_intr<"rcp", HasAVX>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
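
// Illustrative only: a common single Newton-Raphson refinement step for
// RSQRTPS, x1 = x0 * (1.5 - 0.5 * a * x0 * x0), sketched as a hypothetical
// helper using <xmmintrin.h> intrinsics:
//   __m128 rsqrt_nr(__m128 a) {
//     __m128 x0 = _mm_rsqrt_ps(a);
//     __m128 t  = _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), a),
//                            _mm_mul_ps(x0, x0));
//     return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(1.5f), t));
//   }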

// There is no f64 version of the reciprocal approximation instructions.

multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;

multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                       v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                       v4f32, UseSSE1>;


//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Non-temporal stores
//===----------------------------------------------------------------------===//

let AddedComplexity = 400 in { // Prefer non-temporal versions
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
} // SchedRW

let ExeDomain = SSEPackedInt in {
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins i128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)]>, VEX, VEX_WIG,
                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins i256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
} // ExeDomain
} // Predicates

let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;

let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity
3196
3197//===----------------------------------------------------------------------===//
3198// SSE 1 & 2 - Prefetch and memory fence
3199//===----------------------------------------------------------------------===//
3200
3201// Prefetch intrinsic.
3202let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3203def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3204    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3205def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3206    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3207def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3208    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3209def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3210    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3211}
3212
// FIXME: How should the flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasCLFLUSH]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
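// (The byte sequence F3 90 is just a REP prefix on NOP, so pre-SSE2
// processors execute it as a plain NOP.)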
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
               PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
               PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
               PS, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
              PS, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
              PS, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                   XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                   XS, VEX, VEX_L, VEX_WIG;
}

let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(store (v2i64 VR128:$src), addr:$dst)]>,
                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}

let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}", []>,
                   XS, Requires<[UseSSE2]>;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVDQArr">;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}", []>,
                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
}
} // SchedRW

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                 XS, Requires<[UseSSE2]>;
}

let mayStore = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                 XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // Additional patterns for other integer sizes.
  def : Pat<(alignedloadv4i32 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8f16 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (VMOVDQUrm addr:$src)>;

  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt
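// Expansion sketch for PDI_binop_rm2 (hypothetical mnemonic "foo", for
// illustration only):
//   defm FOO : PDI_binop_rm2<0xF5, "foo", SomeNode, v4i32, v8i16, VR128,
//                            memop, i128mem, SchedWriteVecIMul.XMM>;
// yields FOOrr (reg, reg) and FOOrm (reg, mem), with the memory form using
// the folded scheduling class of the supplied write resource.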

defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                             SchedWriteVecIMul, 1, NoVLX>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
                              VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                               0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                             memop, i128mem, SchedWriteVecIMul.XMM>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             load, i128mem, SchedWritePSADBW.XMM, 0>,
                             VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                             load, i256mem, SchedWritePSADBW.YMM, 0>,
                             VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memop, i128mem, SchedWritePSADBW.XMM>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (SrcVT (ld_frag addr:$src2)))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
       Sched<[schedImm]>;
}
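// Each PDI_binop_rmi instantiation yields three forms: rr (shift count in an
// XMM register), rm (count loaded from a 128-bit memory operand), and ri
// (count as an 8-bit immediate, encoded via opc2/ImmForm and lowered through
// OpNode2).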

multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
                                VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
                            memop>;
}
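// Naming note: via the V#NAME and V#NAME#Y concatenations above, a use such
// as "defm PSLLW : PDI_binop_rmi_all<...>" (see below) produces the tied
// two-address SSE forms (PSLLW*), the three-operand VEX forms (VPSLLW*), and
// the 256-bit AVX2 forms (VPSLLWY*).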

multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
       Sched<[sched]>;
}

multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                               VR256, v32i8, sched.YMM, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
                           sched.XMM>;
}

let ExeDomain = SSEPackedInt in {
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
                      VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode (load addr:$src1),
                         (i8 timm:$src2))))]>, VEX,
                      Sched<[sched.XMM.Folded]>, VEX_WIG;
}

let Predicates = [HasAVX2, prd] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode (load addr:$src1),
                          (i8 timm:$src2))))]>, VEX, VEX_L,
                       Sched<[sched.YMM.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
               Sched<[sched.XMM]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode (memop addr:$src1),
                        (i8 timm:$src2))))]>,
               Sched<[sched.XMM.Folded]>;
}
}
} // ExeDomain = SSEPackedInt
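// Immediate semantics: the 8-bit immediate packs four 2-bit source indices;
// for PSHUFD, result element i comes from source element (imm >> (2*i)) & 3.
// PSHUFHW/PSHUFLW apply the same scheme to the high/low four words and pass
// the other half through unchanged.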

defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
                             SchedWriteShuffle, NoVLX>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;

//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1),
                                    (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
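// Usage note: the rr/rm pair narrows two source vectors into one result with
// saturation, e.g. packsswb takes two v8i16 inputs and produces one v16i8
// result with signed saturation (see the instantiations below).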

multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1),
                                      (ld_frag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;

  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
                       X86FoldableSchedWrite sched, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs RC:$dst), (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
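// Semantics note: X86Unpckl interleaves the low halves of the two sources
// and X86Unpckh the high halves, e.g. punpcklbw on byte vectors a and b
// yields {a0,b0,a1,b1,...,a7,b7}.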

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : Ii8<0xC4, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1,
                       i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    timm:$src3))]>,
       Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
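// Expansion sketch: the rr form inserts the low word of a GPR, and the rm
// form a 16-bit memory operand, into the word element selected by the
// immediate; Is2Addr = 1 gives the tied SSE syntax, Is2Addr = 0 the
// three-operand VEX syntax used by VPINSRW below.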

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            timm:$src2))]>,
                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            timm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt

// Always select real FP16 instructions when they are available; the negative
// AddedComplexity deprioritizes these fallback patterns.
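// The fallbacks implement f16 loads, stores, and GPR<->f16 bitcasts without
// dedicated scalar FP16 moves: the value is routed through word element 0 of
// an XMM register via PINSRW/PEXTRW (VPINSRW/VPEXTRW for the AVX variants).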
let Predicates = [UseSSE2], AddedComplexity = -10 in {
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
  def : Pat<(store f16:$src, addr:$dst),
            (MOV16mr addr:$dst,
              (EXTRACT_SUBREG
                (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                sub_16bit))>;
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrr (v8i16 (IMPLICIT_DEF)),
                        (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                        0), FR16)>;
}

let Predicates = [HasAVX, NoBWI] in {
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrr (v8i16 (IMPLICIT_DEF)),
                         (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                         0), FR16)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// As VEX does not have separate instruction contexts for address-size
// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
// Prefer VMASKMOVDQU64.
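//
// MASKMOVDQU stores only the bytes of $src whose corresponding byte in $mask
// has its most significant bit set, to [EDI]/[RDI] (hence the implicit Uses),
// with a non-temporal hint.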
4063let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
4064def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4065           (ins VR128:$src, VR128:$mask),
4066           "maskmovdqu\t{$mask, $src|$src, $mask}",
4067           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4068           VEX, VEX_WIG;
4069let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4070def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4071           (ins VR128:$src, VR128:$mask),
4072           "maskmovdqu\t{$mask, $src|$src, $mask}",
4073           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4074           VEX, VEX_WIG;
4075
4076let Uses = [EDI], Predicates = [UseSSE2] in
4077def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4078           "maskmovdqu\t{$mask, $src|$src, $mask}",
4079           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4080let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4081def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4082           "maskmovdqu\t{$mask, $src|$src, $mask}",
4083           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4084
4085} // ExeDomain = SSEPackedInt
4086
4087//===---------------------------------------------------------------------===//
4088// SSE2 - Move Doubleword/Quadword
4089//===---------------------------------------------------------------------===//
4090
4091//===---------------------------------------------------------------------===//
4092// Move Int Doubleword to Packed Double Int
4093//
4094let ExeDomain = SSEPackedInt in {
4095def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4096                        "movd\t{$src, $dst|$dst, $src}",
4097                        [(set VR128:$dst,
4098                          (v4i32 (scalar_to_vector GR32:$src)))]>,
4099                          VEX, Sched<[WriteVecMoveFromGpr]>;
4100def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4101                        "movd\t{$src, $dst|$dst, $src}",
4102                        [(set VR128:$dst,
4103                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4104                        VEX, Sched<[WriteVecLoad]>;
4105def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4106                          "movq\t{$src, $dst|$dst, $src}",
4107                          [(set VR128:$dst,
4108                            (v2i64 (scalar_to_vector GR64:$src)))]>,
4109                          VEX, Sched<[WriteVecMoveFromGpr]>;
4110let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4111def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4112                          "movq\t{$src, $dst|$dst, $src}", []>,
4113                          VEX, Sched<[WriteVecLoad]>;
4114let isCodeGenOnly = 1 in
4115def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4116                         "movq\t{$src, $dst|$dst, $src}",
4117                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
4118                         VEX, Sched<[WriteVecMoveFromGpr]>;
4119
4120def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4121                      "movd\t{$src, $dst|$dst, $src}",
4122                      [(set VR128:$dst,
4123                        (v4i32 (scalar_to_vector GR32:$src)))]>,
4124                      Sched<[WriteVecMoveFromGpr]>;
4125def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4126                      "movd\t{$src, $dst|$dst, $src}",
4127                      [(set VR128:$dst,
4128                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4129                      Sched<[WriteVecLoad]>;
4130def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4131                        "movq\t{$src, $dst|$dst, $src}",
4132                        [(set VR128:$dst,
4133                          (v2i64 (scalar_to_vector GR64:$src)))]>,
4134                        Sched<[WriteVecMoveFromGpr]>;
4135let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4136def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4137                        "movq\t{$src, $dst|$dst, $src}", []>,
4138                        Sched<[WriteVecLoad]>;
4139let isCodeGenOnly = 1 in
4140def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4141                       "movq\t{$src, $dst|$dst, $src}",
4142                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
4143                       Sched<[WriteVecMoveFromGpr]>;
4144} // ExeDomain = SSEPackedInt
4145
4146//===---------------------------------------------------------------------===//
4147// Move Int Doubleword to Single Scalar
4148//
4149let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4150  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4151                        "movd\t{$src, $dst|$dst, $src}",
4152                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4153                        VEX, Sched<[WriteVecMoveFromGpr]>;
4154
4155  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4156                        "movd\t{$src, $dst|$dst, $src}",
4157                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4158                        Sched<[WriteVecMoveFromGpr]>;
4159
4160} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4161
4162//===---------------------------------------------------------------------===//
4163// Move Packed Doubleword Int to Packed Double Int
4164//
4165let ExeDomain = SSEPackedInt in {
4166def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4167                         "movd\t{$src, $dst|$dst, $src}",
4168                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4169                                          (iPTR 0)))]>, VEX,
4170                         Sched<[WriteVecMoveToGpr]>;
4171def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4172                         (ins i32mem:$dst, VR128:$src),
4173                         "movd\t{$src, $dst|$dst, $src}",
4174                         [(store (i32 (extractelt (v4i32 VR128:$src),
4175                                       (iPTR 0))), addr:$dst)]>,
4176                         VEX, Sched<[WriteVecStore]>;
4177def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4178                       "movd\t{$src, $dst|$dst, $src}",
4179                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4180                                        (iPTR 0)))]>,
4181                   Sched<[WriteVecMoveToGpr]>;
4182def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4183                       "movd\t{$src, $dst|$dst, $src}",
4184                       [(store (i32 (extractelt (v4i32 VR128:$src),
4185                                     (iPTR 0))), addr:$dst)]>,
4186                       Sched<[WriteVecStore]>;
4187} // ExeDomain = SSEPackedInt
4188
4189//===---------------------------------------------------------------------===//
4190// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                        (iPTR 0)))]>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                         (iPTR 0)))]>;
} //SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Doubleword Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
            (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. In order to parse that old
// assembly, we add these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
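// For example, legacy assembly such as:
//   movd %rax, %xmm0
// is accepted through the alias above and selects MOV64toPQIrr; because the
// aliases pass 0 for the emit priority, the printer still uses the canonical
// "movq" (or "vmovq") spelling.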

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clearing the upper 64 bits. Note: there is a bug
// in the IA32 documentation; movq xmm1, xmm2 does clear the high bits.
//
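// Illustrative semantics (a sketch of the intended behavior):
//   movq %xmm1, %xmm0   ; xmm0[63:0] = xmm1[63:0], xmm0[127:64] = 0
//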
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVZPQILo2PQIrr
                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVZPQILo2PQIrr
                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
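// A rough sketch of the shuffle semantics, in terms of the corresponding C
// intrinsics from <pmmintrin.h>:
//   __m128 h = _mm_movehdup_ps(a);  // {a[1], a[1], a[3], a[3]}
//   __m128 l = _mm_moveldup_ps(a);  // {a[0], a[0], a[2], a[2]}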

multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
                      Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
                      Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//
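// A rough sketch of the semantics, via the C intrinsic from <pmmintrin.h>:
//   __m128d r = _mm_movedup_pd(a);  // {a[0], a[0]}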

multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
                    Sched<[sched.XMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[sched.XMM.Folded]>;
}

// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[sched.YMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
                    Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                                      VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                                        VEX, VEX_L, VEX_WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;


let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
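// lddqu is a 128-bit unaligned integer load, exposed in C as _mm_lddqu_si128
// (<pmmintrin.h>); note it is only selected through that intrinsic below, not
// for generic unaligned vector loads.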

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
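// addsub alternates subtract (even lanes) and add (odd lanes). A rough sketch
// via the C intrinsic from <pmmintrin.h>:
//   __m128 r = _mm_addsub_ps(a, b); // {a[0]-b[0], a[1]+b[1], a[2]-b[2], a[3]+b[3]}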

multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  XD, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
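// A rough sketch of the horizontal-add semantics, via the C intrinsic from
// <pmmintrin.h>:
//   __m128 r = _mm_hadd_ps(a, b); // {a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3]}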
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
        Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//
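// A rough sketch of the semantics, via a C intrinsic from <tmmintrin.h>:
//   __m128i r = _mm_abs_epi16(a); // per 16-bit lane: r[i] = |a[i]|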

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (ld_frag addr:$src))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (load addr:$src))))]>,
                  Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                              load>, VEX, VEX_WIG;
  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                              load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                              load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                                VEX, VEX_L, VEX_WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memop>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//
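// As an example of the ops defined below, pshufb is a variable byte shuffle;
// roughly, for each byte i:
//   dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F]
// (C intrinsic: _mm_shuffle_epi8 in <tmmintrin.h>).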

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (load addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                  VR128, load, i128mem,
                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                  VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                      int_x86_ssse3_psign_b_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                      int_x86_ssse3_psign_w_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                      int_x86_ssse3_psign_d_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
}
}

let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                  VR256, load, i256mem,
                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                  load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                  load, i256mem,
                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                       int_x86_avx2_phadd_sw,
                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                       int_x86_avx2_phsub_sw,
                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
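// Roughly, palignr concatenates its sources (src1 high, src2 low) and extracts
// a byte-aligned 128-bit window shifted right by imm bytes. Via the C
// intrinsic from <tmmintrin.h>:
//   __m128i r = _mm_alignr_epi8(a, b, n); // low 16 bytes of ((a:b) >> (n*8))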

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                     TB, Requires<[HasSSE3, Not64BitMode]>;
let Uses = [RAX, ECX, EDX] in
def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                     TB, Requires<[HasSSE3, In64BitMode]>;

let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//
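// A rough sketch of the extension semantics, via a C intrinsic from
// <smmintrin.h>:
//   __m128i r = _mm_cvtepu8_epi16(a); // zero-extend low 8 bytes of a to 8 words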

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, SchedWriteVecExtend.XMM>,
                                     VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, SchedWriteVecExtend.YMM>,
                                     VEX, VEX_L, VEX_WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;

// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//
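// A rough sketch of the semantics, via a C intrinsic from <smmintrin.h>:
//   int r = _mm_extract_epi32(a, 2); // r = 32-bit element 2 of a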

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         timm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;

let Predicates = [UseSSE41] in
  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

let Predicates = [HasAVX, NoBWI] in
  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or
/// memory destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in {
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                       GR8:$src2, sub_8bit), timm:$src3)>;
}

let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector; the
// third matches the intrinsic and may zero arbitrary elements in the target
// vector.
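// In the insertps immediate, bits [7:6] pick the source element, bits [5:4]
// pick the destination element, and bits [3:0] are a zero mask; e.g. 0x1c
// copies element 0 of $src2 into element 1 of $dst and zeroes elements 2-3.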
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
                           X86FoldableSchedWrite sched> {
  // Vector intrinsic operation, reg.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}

multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}

// FP round - roundss, roundps, roundsd, roundpd
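// The round immediate selects the rounding mode: bits [1:0] choose nearest /
// floor / ceil / truncate, bit 2 defers to MXCSR.RC instead, and bit 3
// suppresses precision exceptions (e.g. 0x9 = floor, 0xA = ceil, 0xB = trunc).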
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, VEX_WIG;
  }
}
let Predicates = [UseAVX] in {
  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>,
                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
}

let Predicates = [UseAVX] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;

defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                               v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSr FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDr FR64:$src1, timm:$src2)>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSm addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDm addr:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// PTEST - we lower to this in X86ISelLowering, primarily from the Intel
// intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                       Sched<[WritePOPCNT.Folded]>, XS;
}

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator operating on v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                         X86phminpos, memop,
                                         WritePHMINPOS>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                  VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                 VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr,
                 X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

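// Commuting a blend swaps its sources, so the lane-select immediate has to be
// inverted; each XForm below masks the immediate to its width and flips it.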
def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;
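// e.g. BlendScaleImm4 turns 0b0101 into 0b00110011.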

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;
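// e.g. BlendScaleImm2 turns 0b01 into 0b00001111.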

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;
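// e.g. BlendScaleImm2to4 turns 0b10 into 0b1100.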

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;
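// e.g. BlendScaleCommuteImm4 turns 0b0101 into 0b11001100.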

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;
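// e.g. BlendScaleCommuteImm2 turns 0b01 into 0b11110000.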

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;
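// e.g. BlendScaleCommuteImm2to4 turns 0b10 into 0b0011.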

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, load, i128mem, 0,
                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, load, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, load, i256mem, 0,
                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memop, i128mem, 1,
                                     SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPS.XMM>, SIMD_EXC;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPD.XMM>, SIMD_EXC;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, load, i128mem, 0, SSEPackedInt,
                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will clean up domains later on.
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
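// e.g. the first pattern below blends the inserted xmm (widened to ymm) over
// the low two f64 lanes with a vblendpd immediate of 0x3.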
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (OpNode RC:$src3, (mem_frag addr:$src2),
                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                Sched<[sched.Folded, sched.ReadAfterFold,
                       // x86memop:$src2
                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                       ReadDefault,
                       // RC::$src3
                       sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                       v16i8, loadv16i8, X86Blendv,
                                       SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                          (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                          (i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These
// patterns were changed to use blends because blends have better throughput
// on Sandy Bridge and Haswell, but movs[s/d] instructions are 1-2 bytes shorter.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}


/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}

let AddedComplexity = 400 in { // Prefer non-temporal versions

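// MOVNTDQA is an aligned non-temporal vector load (a streaming-load hint,
// mainly useful on write-combining memory). The patterns below select it for
// any aligned non-temporal load of the matching width.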
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

multiclass SS42AI_pcmpistrm<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM : SS42AI_pcmpistrm<"vpcmpistrm">, VEX, VEX_WIG;
  defm PCMPISTRM  : SS42AI_pcmpistrm<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
  defm PCMPESTRM  : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// CRC32 intrinsic instructions.
// These instructions come only in rr and rm forms; the variants differ only
// in the sizes of r and m.
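// Note: crc32 implements CRC-32C (the Castagnoli polynomial, 0x11EDC6F41),
// not the CRC-32 polynomial used by zlib and PNG.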
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
         Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8PS, Sched<[sched]>;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (memop addr:$src2), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (memop addr:$src2))))]>, T8PS,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 timm:$src3)))]>, TAPS,
                         Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (memop addr:$src2),
                            (i8 timm:$src3)))]>, TAPS,
                         Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

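  // sha256rnds2 reads an implicit third source operand from XMM0 (the current
  // WK values), which is what the UsesXMM0 mode of SHAI_binop models.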
  let Uses = [XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}

// Aliases with explicit %xmm0
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  let AsmString = OpcodeStr#
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
                   Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
}

let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast, memop, 1>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX, VEX_WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
  Sched<[WriteAESIMC.Folded]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
  Sched<[WriteAESKeyGen.Folded]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//

// Immediate transform to help with commuting.
def PCLMULCommuteImm : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;
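// Only bits 0 and 4 of the pclmulqdq immediate are meaningful: bit 0 selects
// the quadword of the first source and bit 4 the quadword of the second, so
// swapping the sources is equivalent to swapping the two nibbles of the
// immediate (e.g. 0x01 <-> 0x10), which is what the transform above computes.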

// SSE carry-less Multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  let Constraints = "$src1 = $dst" in {
    let isCommutable = 1 in
    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
                Sched<[WriteCLMul]>;

    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                  timm:$src3))]>,
              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  } // Constraints = "$src1 = $dst"

  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                (i8 timm:$src3)),
            (PCLMULQDQrm VR128:$src1, addr:$src2,
                          (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]

// SSE aliases
foreach HI = ["hq","lq"] in
foreach LO = ["hq","lq"] in {
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrr VR128:$dst, VR128:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
}
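// In the alias names, the first hq/lq component selects the quadword of the
// first (destination) operand via immediate bit 0, and the second component
// selects the quadword of the source via bit 4; e.g. pclmulhqlqdq is
// pclmulqdq with immediate 0x01.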

// AVX carry-less Multiplication instructions
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  let isCommutable = 1 in
  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
            Sched<[WriteCLMul]>;

  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
               (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
                                           (PCLMULCommuteImm timm:$src3))>;
}

let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;

let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;

multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}

multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
}

// AVX aliases
defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
                                    timm:$idx))]>,
                 PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>,
              PD, Sched<[SchedWriteVecALU.XMM]>;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                      timm:$len, timm:$idx))]>,
                   XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>,
                 XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt

// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", []>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW

def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag bcast_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
        Sched<[Sched]>, VEX;

// AVX2 adds register forms
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                        v4f64, X86VBroadcastld64,
                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;

//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
//                  halves of a 256-bit vector.
//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteShuffleLd]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
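// In the vperm2f128 immediate, bits [1:0] and [5:4] each select a 128-bit
// half, with bit 1 of each field choosing between the two sources; XORing
// with 0x22 therefore redirects both selectors to the other source after the
// operands are commuted.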

multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
  // Pattern with load in other operand.
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                             (Perm2XCommuteImm timm:$imm))>;
}

let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}

let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
// with a YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
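// Comparison predicate 0xf is TRUE (always-true, unordered, quiet), so the
// compare returns all-ones lanes regardless of the inputs.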
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}

multiclass vinsert_lowering<string InstrStr, string PermStr,
                            ValueType From, ValueType To,
                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                   (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (frommemop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
  // Folding "To" vector - convert to perm2x128 and commute inputs.
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr#rm)
              (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
              addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256,
                                 WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256,
                                 WriteFMaskMove64, WriteFMaskMove64Y>;

//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
    ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, VR128:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
                                       VR128:$src2, VR128:$src3)))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;

  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
                                      (loadv4i32 addr:$src3))))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;

  let isCommutable = IsCommutable in
  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, VR256:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
                                       VR256:$src2, VR256:$src3)))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
                                      (loadv8i32 addr:$src3))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>;
defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;

def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
                             (X86vpmaddwd node:$lhs, node:$rhs), [{
  return N->hasOneUse();
}]>;
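// The single-use ('_su') restriction ensures we only merge a vpmaddwd whose
// result has no other users, since vpdpwssd replaces both the multiply and
// the add in the patterns below.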

let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
}

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
               Sched<[varsched]>;
    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop_i:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                              (i_vt (load addr:$src2)))))]>, VEX_4V,
               Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
             Sched<[sched]>;
    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
             Sched<[sched.Folded]>;
  } // Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
//
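// VZEROUPPER is typically inserted before transitions to legacy SSE code to
// avoid AVX-to-SSE transition penalties.
//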
7426
7427let SchedRW = [WriteSystem] in {
7428let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7429            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7430  // Zero All YMM registers
7431  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7432                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7433                  Requires<[HasAVX]>, VEX_WIG;
7434
7435  // Zero Upper bits of YMM registers
7436  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7437                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7438                     Requires<[HasAVX]>, VEX_WIG;
7439} // Defs
7440} // SchedRW

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
             T8PD, VEX, Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             []>, T8PD, VEX, Sched<[sched.Folded]>;
}

multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
               TAPD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX, Sched<[MR]>;
}

let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>, SIMD_EXC;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
            (VCVTPH2PSYrm addr:$src)>;

  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}
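
// Illustrative only: a C-level round trip through the conversions above.
//
//   #include <immintrin.h>
//   __m128 load4_halves(const unsigned short *p) {
//     __m128i h = _mm_loadl_epi64((const __m128i *)p); // 4 x f16 in low 64 bits
//     return _mm_cvtph_ps(h);                          // vcvtph2ps
//   }
//   __m128i round4_to_halves(__m128 v) {               // vcvtps2ph
//     return _mm_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   }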

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}
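
// Illustrative only: the C intrinsic that maps onto VPBLENDD/VPBLENDDY.
//
//   #include <immintrin.h>
//   __m256i low4_from_a_high4_from_b(__m256i a, __m256i b) {
//     return _mm256_blend_epi32(a, b, 0xF0);  // vpblendd: imm bit i selects b
//   }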

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
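
// Illustrative only: the shape of code these patterns improve. An insert into
// the low 128 bits, e.g.
//
//   #include <immintrin.h>
//   __m256i replace_low(__m256i a, __m128i b) {
//     return _mm256_insertf128_si256(a, b, 0);
//   }
//
// can be selected as vblendps with immediate 0xf rather than vinsert*128.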

//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
//               destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag bcast_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (bcast_frag addr:$src)))]>,
                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                   Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (bcast_frag addr:$src)))]>,
                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically do the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
  }
}

defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
                                    v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
                                    v2i64, v4i64, NoVLX>;
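
// Illustrative only: typical C entry points that select these broadcasts
// (element-width splats typically compile to vpbroadcastb/w/d/q under AVX2).
//
//   #include <immintrin.h>
//   __m256i splat_i32(int x)     { return _mm256_set1_epi32(x); }      // vpbroadcastd
//   __m256i splat_i8(__m128i v)  { return _mm256_broadcastb_epi8(v); } // vpbroadcastb
//   __m128i splat_i64(__m128i v) { return _mm_broadcastq_epi64(v); }   // vpbroadcastq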

let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBrr (VMOVDI2PDIrr
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR8:$src, sub_8bit))))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBYrr (VMOVDI2PDIrr
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit))))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWrr (VMOVDI2PDIrr
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR16:$src, sub_16bit))))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWYrr (VMOVDI2PDIrr
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR16:$src, sub_16bit))))>;

  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
            (VPBROADCASTWYrm addr:$src)>;

  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
            (VPBROADCASTWrr VR128:$src)>;
  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
            (VPBROADCASTWYrr VR128:$src)>;

  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSrm addr:$src)>;
}

// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
}

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX_4V, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (load addr:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
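
// Illustrative only: vpermps from C. Note the intrinsic takes the index
// vector second, while the instruction carries it in $src1.
//
//   #include <immintrin.h>
//   __m256 reverse8(__m256 v) {
//     const __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
//     return _mm256_permutevar8x32_ps(v, idx);   // vpermps
//   }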

multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 timm:$src2))))]>,
                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;
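
// Illustrative only: the immediate cross-lane permute from C.
//
//   #include <immintrin.h>
//   __m256i swap_128bit_halves(__m256i v) {
//     return _mm256_permute4x64_epi64(v, 0x4E);  // vpermq $0x4e: elts 2,3,0,1
//   }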

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}
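
// Illustrative only: vperm2i128 from C. Each immediate nibble picks a 128-bit
// half from either source (bit 3 of the nibble zeroes that half instead).
//
//   #include <immintrin.h>
//   __m256i concat_low_halves(__m256i a, __m256i b) {
//     return _mm256_permute2x128_si256(a, b, 0x20);  // dst = { a.lo, b.lo }
//   }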

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16,  loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16,  loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
}
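
// Illustrative only: vinserti128 from C.
//
//   #include <immintrin.h>
//   __m256i replace_high(__m256i a, __m128i b) {
//     return _mm256_inserti128_si256(a, b, 1);  // vinserti128 $1
//   }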

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
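
// Illustrative only: vextracti128 from C.
//
//   #include <immintrin.h>
//   __m128i get_high(__m256i a) {
//     return _mm256_extracti128_si256(a, 1);    // vextracti128 $1
//   }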

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                             (VT immAllZerosV))),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // Masked i32/i64 load/store is not supported on AVX1; use the ps/pd version.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
}
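
// Illustrative only: the AVX2 masked moves from C. Lanes whose mask element
// has its most significant bit clear load as zero and are not written on store.
//
//   #include <immintrin.h>
//   __m256i masked_load8(const int *p, __m256i mask) {
//     return _mm256_maskload_epi32(p, mask);   // vpmaskmovd (load form)
//   }
//   void masked_store8(int *p, __m256i mask, __m256i v) {
//     _mm256_maskstore_epi32(p, mask, v);      // vpmaskmovd (store form)
//   }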

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (load addr:$src2)))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (load addr:$src2)))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                   SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
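
// Illustrative only: per-element variable shifts from C. Shift counts >= the
// element width produce 0 (or all sign bits for vpsravd).
//
//   #include <immintrin.h>
//   __m256i shl_each(__m256i v, __m256i counts) {
//     return _mm256_sllv_epi32(v, counts);   // vpsllvd
//   }
//   __m256i sar_each(__m256i v, __m256i counts) {
//     return _mm256_srav_epi32(v, counts);   // vpsravd
//   }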

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
}
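
// Illustrative only: an AVX2 gather from C. The $mask_wb output above models
// the mask register being cleared as elements complete; that is why both
// outputs are tied and marked @earlyclobber.
//
//   #include <immintrin.h>
//   __m256i gather8(const int *base, __m256i idx) {
//     return _mm256_i32gather_epi32(base, idx, 4);   // vpgatherdd, scale 4
//   }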

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[sched]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                 (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
  def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, RC:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
              SSEPackedInt>, Sched<[sched]>;
  def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
              (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                    (MemOpFrag addr:$src2),
                                    timm:$src3)))], SSEPackedInt>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME       : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                    VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME   : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                    load, i128mem, SchedWriteVecIMul.XMM>,
                                    VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem, SchedWriteVecIMul.YMM>,
                                    VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                   i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                   i128mem, SchedWriteVecALU.XMM>, VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
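
// Illustrative only: the GFNI operations from C, over GF(2^8) with the AES
// polynomial x^8 + x^4 + x^3 + x + 1.
//
//   #include <immintrin.h>
//   __m128i gf_mul(__m128i a, __m128i b) {
//     return _mm_gf2p8mul_epi8(a, b);              // gf2p8mulb
//   }
//   __m128i gf_affine(__m128i x, __m128i A) {
//     return _mm_gf2p8affine_epi64_epi8(x, A, 0);  // gf2p8affineqb, b = 0
//   }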

// AVX-IFMA
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
    checkVEXPredicate = 1 in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode has the multiply operands first with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                         VR128:$src3, VR128:$src1)))]>,
               VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  }
  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                      (loadv2i64 addr:$src3), VR128:$src1)))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  let isCommutable = 1 in {
    def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, VR256:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                         VR256:$src3, VR256:$src1)))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                      (loadv4i64 addr:$src3), VR256:$src1)))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;
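
// Illustrative only: AVX-IFMA from C. The intrinsic name below follows the
// "_avx_" convention used to distinguish these from the AVX512IFMA forms;
// treat the exact name as an assumption of this sketch.
//
//   #include <immintrin.h>
//   __m256i madd52lo(__m256i acc, __m256i a, __m256i b) {
//     // per 64-bit lane: acc + low 52 bits of (a[51:0] * b[51:0])
//     return _mm256_madd52lo_avx_epu64(acc, a, b);   // vpmadd52luq
//   }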

// AVX-VNNI-INT8
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr  :  I<Opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX_4V, Sched<[Sched]>;
  def rm  :  I<Opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, X86memop:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
                                   (MemOpFrag addr:$src3))))]>,
             VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}

let Predicates = [HasAVXVNNIINT8] in {
  defm VPDPBSSD   : avx_dotprod_rm<0x50, "vpdpbssd",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDY  : avx_dotprod_rm<0x50, "vpdpbssd",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUD   : avx_dotprod_rm<0x50, "vpdpbuud",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDY  : avx_dotprod_rm<0x50, "vpdpbuud",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSSDS  : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
                                   1>, T8XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8XD;
  defm VPDPBUUDS  : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
                                   1>, T8PS;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8PS;
  defm VPDPBSUD   : avx_dotprod_rm<0x50, "vpdpbsud",  v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsud,  SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDY  : avx_dotprod_rm<0x50, "vpdpbsud",  v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsud,  SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
  defm VPDPBSUDS  : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
                                   0>, T8XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8XS;
}
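
// Illustrative only: an AVX-VNNI-INT8 dot product from C. Intrinsic name as
// published for this extension; treat it as an assumption of this sketch.
//
//   #include <immintrin.h>
//   __m128i dp_s8_s8(__m128i acc, __m128i a, __m128i b) {
//     return _mm_dpbssd_epi32(acc, a, b);   // vpdpbssd: 4 x i8*i8 per i32 lane
//   }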

// AVX-NE-CONVERT
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                  X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>,
              Sched<[WriteCvtPH2PS]>, VEX;
  def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}

multiclass VCVTNEPS2BF16_BASE {
  def rr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
             Sched<[WriteCvtPH2PS]>;
  def rm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
             "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
             Sched<[WriteCvtPH2PS]>;
  def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
             "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
             Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
             "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
             Sched<[WriteCvtPH2PSY]>, VEX_L;
}

let Predicates = [HasAVXNECONVERT] in {
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
       f16mem>, T8XS;
  defm VBCSTNESH2PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>,
       T8PD;
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
       f256mem>, T8XS;
  defm VCVTNEEPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
       f256mem>, T8PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
       f256mem>, T8XD;
  defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
       f256mem>, T8PS;
  let checkVEXPredicate = 1 in
  defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
}

def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;